CHANGELOG.org

* Change Log

Early commits may not have entries because development moved quickly at that stage.

** v0.6
** v0.4
** v0.3
** v0.2
** v0.1

LICENSE

MIT License

Copyright (c) 2017 ADicksonLab

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

wepy/__init__.py

wepy/analysis/__init__.py

wepy/analysis/contig_tree.py

import itertools as it

import networkx as nx
import numpy as np

from wepy.analysis.parents import DISCONTINUITY_VALUE, \
     parent_panel, net_parent_table, \
     ancestors, sliding_window

class ContigTree(nx.DiGraph):

    RESAMPLING_PANEL_KEY = 'resampling_steps'
    PARENTS_KEY = 'parent_idxs'
    DISCONTINUITY_KEY = 'discontinuities'

    def __init__(self, wepy_h5,
                 continuations=Ellipsis,
                 runs=Ellipsis,
                 boundary_condition_class=None,
                 decision_class=None):

        super().__init__()

        self._wepy_h5 = wepy_h5

        # we can optionally specify which continuations to use when
        # creating the contig tree instead of defaulting to the whole file
        self._continuations = set()
        self._run_idxs = set()

        # if specific runs were specified we add them right away, they
        # should be unique
        if runs is Ellipsis:
            self._run_idxs.update(self.wepy_h5.run_idxs)
        elif runs is not None:
            self._run_idxs.update(runs)

        # the continuations also give extra runs to incorporate into
        # this contig tree

        # if it is Ellipsis (...)
then we include all runs and all the continuations if continuations is Ellipsis: self._run_idxs.update(self.wepy_h5.run_idxs) self._continuations.update([(a,b) for a, b in self.wepy_h5.continuations]) # otherwise we make the tree based on the runs in the # continuations elif continuations is not None: # the unique run_idxs self._run_idxs.update(it.chain(*self._continuations)) # the continuations themselves self._continuations.update([(a,b) for a, b in continuations]) # using the wepy_h5 create a tree of the cycles self._create_tree() self._set_resampling_panels() if decision_class is not None: self._set_parents(decision_class) if boundary_condition_class is not None: self._set_discontinuities(boundary_condition_class) def _create_tree(self): # first go through each run without continuations for run_idx in self._run_idxs: n_cycles = self.wepy_h5.run_n_cycles(run_idx) # make all the nodes for this run nodes = [(run_idx, step_idx) for step_idx in range(n_cycles)] self.add_nodes_from(nodes) # the same for the edges edge_node_idxs = list(zip(range(1, n_cycles), range(n_cycles - 1))) edges = [(nodes[a], nodes[b]) for a, b in edge_node_idxs] self.add_edges_from(edges) # after we have added all the nodes and edges for the run # subgraphs we need to connect them together with the # information in the contig tree. for edge_source, edge_target in self._continuations: # for the source node (the restart run) we use the run_idx # from the edge source node and the index of the first # cycle source_node = (edge_source, 0) # for the target node (the run being continued) we use the # run_idx from the edge_target and the last cycle index in # the run target_node = (edge_target, self.wepy_h5.run_n_cycles(edge_target)-1) # make the edge edge = (source_node, target_node) # add this connector edge to the network self.add_edge(*edge) def _set_resampling_panels(self): # then get the resampling tables for each cycle and put them # as attributes to the appropriate nodes for run_idx in self.run_idxs: run_resampling_panel = self.wepy_h5.run_resampling_panel(run_idx) # add each cycle of this panel to the network by adding # them in as nodes with the resampling steps first for step_idx, step in enumerate(run_resampling_panel): node = (run_idx, step_idx) self.nodes[node][self.RESAMPLING_PANEL_KEY] = step def _set_discontinuities(self, boundary_conditions_class): # initialize the attributes for discontinuities to 0s for no # discontinuities for node in self.nodes: n_walkers = len(self.node[node][self.PARENTS_KEY]) self.node[node][self.DISCONTINUITY_KEY] = [0 for i in range(n_walkers)] # for run_idx in self.run_idxs: # get the warping records for this run warping_records = self.wepy_h5.warping_records([run_idx]) # just the indices for checking stuff later warp_cycle_idxs = set([rec[0] for rec in warping_records]) # go through the nodes for node in self.nodes: node_run_idx = node[0] node_cycle_idx = node[1] # for a node which is in this run and has warp records if (node_run_idx == run_idx) and (node_cycle_idx in warp_cycle_idxs): # if there is then we want to apply the # warping records for this cycle to the # discontinuities for this cycle cycle_warp_records = [rec for rec in warping_records if (rec[0] == node_cycle_idx)] # go through each record and test if it is a # discontinuous warp for rec in cycle_warp_records: # index of the trajectory this warp effected rec_traj_idx = rec[1] # if it is discontinuous we need to mark that, # otherwise do nothing if boundary_conditions_class.warping_discontinuity(rec): 
self.node[node][self.DISCONTINUITY_KEY][rec_traj_idx] = -1 def _set_parents(self, decision_class): """Determines the net parents for each cycle and sets them in-place to the cycle tree given.""" # just go through each node individually in the tree for node in self.nodes: # get the records for each step in this node node_recs = self.node[node][self.RESAMPLING_PANEL_KEY] # get the node parent table by using the parent panel method # on the node records node_parent_panel = parent_panel(decision_class, [node_recs]) # then get the net parents from this parent panel, and slice # out the only entry from it node_parents = net_parent_table(node_parent_panel)[0] # put this back into the self self.nodes[node][self.PARENTS_KEY] = node_parents @property def run_idxs(self): return self._run_idxs @property def continuations(self): return self._continuations @property def wepy_h5(self): return self._wepy_h5 def contig_trace_to_run_trace(self, contig_trace, contig_walker_trace): """Given a trace of a contig with elements (run_idx, cycle_idx) and walker based trace of elements (traj_idx, cycle_idx) over that contig get the trace of elements (run_idx, traj_idx, cycle_idx) """ trace = [] for frame_idx, contig_el in enumerate(contig_trace): run_idx, cycle_idx = contig_el traj_idx = contig_walker_trace[frame_idx][0] frame = (run_idx, traj_idx, cycle_idx) trace.append(frame) return trace def contig_to_run_trace(self, contig, contig_walker_trace): """Convert a trace of elements (traj_idx, cycle_idx) over the contig trace given over this contig tree and return a trace over the runs with elements (run_idx, traj_idx, cycle_idx). """ # go through the contig and get the lengths of the runs that # are its components, and slice that many trace elements and # build up the new trace runs_trace = [] cum_n_frames = 0 for run_idx in contig: # number of frames in this run n_frames = self.wepy_h5.run_n_frames(run_idx) # get the contig trace elements for this run contig_trace_elements = contig_trace[cum_n_frames : n_frames + cum_n_frames] # convert the cycle_idxs to the run indexing and add a run_idx to each run_trace_elements = [(run_idx, traj_idx, cycle_idx - cum_n_frames) for traj_idx, contig_cycle_idx in contig_trace_elements] # add these to the trace runs_trace.extend(run_trace_elements) # then increase the cumulative n_frames for the next run cum_n_frames += n_frames return run_trace_elements def contig_cycle_idx(self, run_idx, cycle_idx): """Get the contig cycle idx for a (run_idx, cycle_idx) pair.""" # make the contig trace contig_trace = self.get_branch(run_idx, cycle_idx) # get the length and subtract one for the index return len(contig_trace) - 1 def get_branch(self, run_idx, cycle_idx, start_contig_idx=0): """Given an identifier of (run_idx, cycle_idx) from the contig tree and a starting contig index generate a contig trace of (run_idx, cycle_idx) indices for that contig. Which is a branch of the tree hence the name. 
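For example (illustrative), if run 1 continues a three-cycle run 0, then
        get_branch(1, 1) returns [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1)],
        ordered from the root of the tree to the requested node.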
""" assert start_contig_idx >= 0, "start_contig_idx must be a valid index" # initialize the current node curr_node = (run_idx, cycle_idx) # make a trace of this contig contig_trace = [curr_node] # stop the loop when we reach the root of the cycle tree at_root = False while not at_root: # first we get the cycle node previous to this one because it # has the parents for the current node parent_nodes = list(self.adj[curr_node].keys()) # if there is no parent then this is the root of the # cycle_tree and we should stop after this step of the loop if len(parent_nodes) == 0: at_root = True # or else we put the node into the contig trace else: parent_node = parent_nodes[0] contig_trace.insert(0, parent_node) # if there are any parents there should only be one, since it is a tree prev_node = parent_nodes[0] # then update the current node to the previous one curr_node = prev_node return contig_trace def trace_parent_table(self, contig_trace): """Given a contig trace returns a parent table for that contig. """ parent_table = [] for run_idx, cycle_idx in contig_trace: parent_idxs = self.node[(run_idx, cycle_idx)][self.PARENTS_KEY] parent_table.append(parent_idxs) return parent_table @classmethod def _tree_leaves(cls, root, tree): # traverse the tree away from the root node until a branch point # is found then iterate over the subtrees from there and # recursively call this function on them branch_child_nodes = [] curr_node = root leaves = [] leaf_found = False while (len(branch_child_nodes) == 0) and (not leaf_found): # get the child nodes child_nodes = list(tree.adj[curr_node].keys()) # if there is more than one child node, the current node is a # branch node if len(child_nodes) > 1: # we will use the branch child nodes as the roots of the # next recursion level branch_child_nodes = child_nodes # if there are no children then this is a leaf node elif len(child_nodes) == 0: # set the current node as the only leaf leaves = [curr_node] # and break out of the loop leaf_found = True # otherwise reset the current node else: # there will only be one child node curr_node = child_nodes[0] # this will run if any child nodes were found to find more leaves, # which won't happen when the loop ended upon finding a leaf node for branch_child_node in branch_child_nodes: branch_leaves = cls._tree_leaves(branch_child_node, tree) leaves.extend(branch_leaves) return leaves def _subtree_leaves(self, root): # get the subtree given the root subtree = self.get_subtree(root) # get the reversed directions of the cycle tree as a view, we # don't need a copy rev_tree = subtree.reverse(copy=False) # then we use the adjacencies to find the last node in the network # using a recursive algorithm leaves = self._tree_leaves(root, rev_tree) return leaves def leaves(self): """All of the leaves of the contig forest""" leaves = [] for root in self.roots(): subtree_leaves = self._subtree_leaves(root) leaves.extend(subtree_leaves) return leaves def _subtree_root(self, node): """ Given a node find the root of the tree it is on""" curr_node = node # we also get the adjacent nodes for this node adj_nodes = list(self.adj[curr_node].keys()) # there should only be one node in the dict assert len(adj_nodes) <= 1, "There should be at most 1 edge" # then we use this node as the starting point to move back to the # root, we end when there is no adjacent node while len(adj_nodes) > 0: # we take another step backwards, and choose the node in the # adjacency adj_nodes = list(self.adj[curr_node]) # there should only be 1 or none nodes assert 
len(adj_nodes) <= 1, "There should be at most 1 edge" # and reset the current node try: curr_node = adj_nodes[0] except IndexError: # this happens when this is the last node, inelegant apologies pass return curr_node def roots(self): subtree_roots = [] for subtree in self.subtrees(): # use a node from the subtree to get the root node = next(subtree.adjacency())[0] subtree_root = self._subtree_root(node) subtree_roots.append(subtree_root) return subtree_roots def subtrees(self): subtree_nxs = [] for component_nodes in nx.weakly_connected_components(self): # actually get the subtree from the main tree subtree = self.subgraph(component_nodes) subtree_nxs.append(subtree) return subtree_nxs def get_subtree(self, node): # get all the subtrees subtrees = self.subtrees() # see which tree the node is in for subtree in subtrees: # if the node is in it this is the subtree it is in so # just return it if node in subtree: return subtree def contig_sliding_windows(self, contig_trace, window_length): """Given a contig trace (run_idx, cycle_idx) get the sliding windows over it (traj_idx, cycle_idx).""" # make a parent table for the contig trace parent_table = self.trace_parent_table(contig_trace) # this gives you windows of trace elements (traj_idx, cycle_idx) windows = sliding_window(parent_table, window_length) return windows def sliding_contig_windows(self, window_length): assert window_length > 1, "window length must be greater than one" # we can deal with each tree in this forest of trees separately, # that is runs that are not connected contig_windows = [] for root in self.roots(): # get the contig windows for the individual tree subtree_contig_windows = self._subtree_sliding_contig_windows(root, window_length) contig_windows.extend(subtree_contig_windows) return contig_windows def _subtree_sliding_contig_windows(self, subtree_root, window_length): # to generate all the sliding windows over a connected cycle tree # it is useful to think of it as a braid, since within this tree # there is a forest of trees which are the lineages of the # walkers. To simplify things we can first generate window traces # over the cycle tree (ignoring the fine structure of the walkers), # which is a very similar process as to the original process of # the sliding windows over trees, and then treat each window trace # as it's own parent table, which can then be treated using the # original sliding window algorithm. As long as the correct contig # traces are generated this will be no problem, and all that is # left is to translate the cycle idxs properly such that the # windows generated are inter-run traces i.e. lists of tuples of # the form (run_idx, traj_idx, cycle_idx) where traj_idx and # cycle_idx are internal to the run specified by run_idx. # so we want to construct a list of contig windows, which are # traces with components (run_idx, cycle_idx) contig_windows = [] # to do this we start at the leaves of the cycle tree, but first # we have to find them. The cycle tree should be directed with # edges pointing towards parents. 
Since we are working with a # single tree, we have a single root and we can just reverse the # directions of the edges and recursively walk the tree until we # find nodes with no adjacent edges # first we need to find the leaves for this subtree leaves = self._subtree_leaves(subtree_root) # now we use these leaves to move backwards in the tree with the # window length to get contiguous segments contig_windows = [] # starting with the leaf nodes we generate contig trace windows # until the last nodes are the same as other windows from other # leaves, i.e. until a branch point has been arrived at between 2 # or more leaf branches. To do this we start at the leaf of the # longest spanning contig and make windows until the endpoint is # no longer the largest contig cycle index. Then we alternate # between them until we find they have converged # initialize the list of active branches all going back to the # root branch_contigs = [self.get_branch(*leaf) for leaf in leaves] done = False while not done: # make a window for the largest endpoint, no need to break # ties since the next iteration will get it contig_lengths = [len(contig) for contig in branch_contigs] longest_branch_idx = np.argmax(contig_lengths) # if the branch is not long enough for the window we end this # process if window_length > len(branch_contigs[longest_branch_idx]): done = True # otherwise we get the next window and do the other processing else: # get the next window for this branch window = branch_contigs[longest_branch_idx][-window_length:] contig_windows.append(window) # pop the last element off of this branch contig last_node = branch_contigs[longest_branch_idx].pop() # if there are any other branches of the same length that have # this as their last node then we have reached a branch point # and that other branch must be eliminated for branch_idx, branch_contig in enumerate(branch_contigs): # compare the last node in contig and if it is the same as # the node that was just used as a window end if branch_contig[-1] == last_node: # this branch is the same so we just get rid of it _ = branch_contigs.pop(branch_idx) return contig_windows def sliding_windows(self, window_length): """All the sliding windows (run_idx, traj_idx, cycle_idx) for all contig windows in the contig tree""" # get all of the contig traces for these trees contig_traces = self.sliding_contig_windows(window_length) # for each of these we generate all of the actual frame sliding windows windows = [] for contig_trace in contig_traces: # these are windows of trace element (traj_idx, cycle_idx) # all over this contig contig_windows = self.contig_sliding_windows(contig_trace, window_length) # convert those traces to the corresponding (run_idx, traj_idx, cycle_idx) # trace for contig_window in contig_windows: run_trace_window = self.contig_trace_to_run_trace(contig_trace, contig_window) windows.append(run_trace_window) return windows PK!+((wepy/analysis/network.pyfrom collections import defaultdict from copy import deepcopy import networkx as nx from wepy.analysis.transitions import transition_counts, counts_d_to_matrix, \ normalize_counts class MacroStateNetworkError(Exception): pass class MacroStateNetwork(): ASSIGNMENTS = 'assignments' def __init__(self, contig_tree, assg_field_key=None, assignments=None, transition_lag_time=2): self._graph = nx.DiGraph() assert not (assg_field_key is None and assignments is None), \ "either assg_field_key or assignments must be given" assert assg_field_key is not None or assignments is not None, \ "one of assg_field_key or 
assignments must be given" self._contig_tree = contig_tree self._wepy_h5 = self._contig_tree.wepy_h5 self._assg_field_key = assg_field_key # the temporary assignments dictionary self._node_assignments = None # and temporary raw assignments self._assignments = None # map the keys to their lists of assignments, depending on # whether or not we are using a field from the HDF5 traj or # assignments provided separately if assg_field_key is not None: assert type(assg_field_key) == str, "assignment key must be a string" self._key_init(assg_field_key) else: self._assignments_init(assignments) # once we have made th dictionary add the nodes to the network # and reassign the assignments to the nodes self._node_idxs = {} for node_idx, assg_item in enumerate(self._node_assignments.items()): assg_key, assigs = assg_item self._graph.add_node(assg_key, node_idx=node_idx, assignments=assigs) self._node_idxs[assg_key] = node_idx # then we compute the total weight of the macrostate and set # that as the default node weight #self.set_macrostate_weights() # now count the transitions between the states and set those # as the edges between nodes # first get the sliding window transitions from the contig # tree, once we set edges for a tree we don't really want to # have multiple sets of transitions on the same network so we # don't provide the method to add different assignments if transition_lag_time is not None: # set the lag time attribute self._transition_lag_time = transition_lag_time # get the transitions transitions = [] for window in self._contig_tree.sliding_windows(self._transition_lag_time): transition = [window[0], window[-1]] # convert the window trace on the contig to a trace # over the runs transitions.append(transition) # then get the counts for those edges counts_d = transition_counts(self._assignments, transitions) # create the edges and set the counts into them for edge, trans_counts in counts_d.items(): self._graph.add_edge(*edge, counts=trans_counts) # then we also want to get the transition probabilities so # we get the counts matrix and compute the probabilities # we first have to replace the keys of the counts of the # node_ids with the node_idxs node_id_to_idx_dict = self.node_id_to_idx_dict() self._countsmat = counts_d_to_matrix( {(node_id_to_idx_dict[edge[0]], node_id_to_idx_dict[edge[1]]) : counts for edge, counts in counts_d.items()}) self._probmat = normalize_counts(self._countsmat) # then we add these attributes to the edges in the network node_idx_to_id_dict = self.node_id_to_idx_dict() for i_id, j_id in self._graph.edges: # i and j are the node idxs so we need to get the # actual node_ids of them i_idx = node_idx_to_id_dict[i_id] j_idx = node_idx_to_id_dict[j_id] # convert to a normal float and set it as an explicitly named attribute self._graph.edges[i_id, j_id]['transition_probability'] = \ float(self._probmat[i_idx, j_idx]) # we also set the general purpose default weight of # the edge to be this. 
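                # here i_id and j_id are node ids (the assignment labels used
                # as graph keys), while i_idx and j_idx are the corresponding
                # integer indices into the counts and probability matrices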
self._graph.edges[i_id, j_id]['Weight'] = \ float(self._probmat[i_idx, j_idx]) # then get rid of the assignments dictionary, this information # can be accessed from the network del self._node_assignments del self._assignments def _key_init(self, assg_field_key): # the key for the assignment in the wepy dataset self._assg_field_key = assg_field_key # blank assignments assignments = [[[] for traj_idx in range(self._wepy_h5.n_run_trajs(run_idx))] for run_idx in self._wepy_h5.run_idxs] # the raw assignments curr_run_idx = -1 for idx_tup, fields_d in self._wepy_h5.iter_trajs_fields( [self.assg_field_key], idxs=True): run_idx = idx_tup[0] traj_idx = idx_tup[1] assg_field = fields_d[self.assg_field_key] assignments[run_idx][traj_idx].extend(assg_field) # then just call the assignments constructor to do it the same # way self._assignments_init(assignments) def _assignments_init(self, assignments): # set the raw assignments to the temporary attribute self._assignments = assignments # this is the dictionary mapping node_id -> the (run_idx, traj_idx, cycle_idx) frames self._node_assignments = defaultdict(list) for run_idx, run in enumerate(assignments): for traj_idx, traj in enumerate(run): for frame_idx, assignment in enumerate(traj): self._node_assignments[assignment].append( (run_idx, traj_idx, frame_idx) ) def node_id_to_idx(self, assg_key): return self.node_id_to_idx_dict()[assg_key] def node_idx_to_id(self, node_idx): return self.node_idx_to_id_dict()[node_idx] def node_id_to_idx_dict(self): return self._node_idxs def node_idx_to_id_dict(self): # just reverse the dictionary and return return {node_idx : node_id for node_id, node_idx in self._node_idxs} @property def graph(self): return self._graph @property def contig_tree(self): return self._contig_tree @property def wepy_h5(self): return self._wepy_h5 @property def assg_field_key(self): return self._assg_field_key @property def countsmat(self): try: return self._countsmat except AttributeError: raise MacroStateNetworkError("transition counts matrix not calculated") @property def probmat(self): try: return self._probmat except AttributeError: raise MacroStateNetworkError("transition probability matrix not set") def get_node_attributes(self, node_id): pass def get_node_attribute(self, node_id, attribute_key): pass def node_assignments(self, node_id): return self.graph.nodes[node_id][self.ASSIGNMENTS] def get_node_fields(self, node_id, fields): node_trace = self.node_assignments(node_id) # use the node_trace to get the weights from the HDF5 fields_d = self.wepy_h5.get_trace_fields(node_trace, fields) return fields_d def iter_nodes_fields(self, fields): nodes_d = {} for node_id in self.graph.nodes: fields_d = self.get_node_fields(node_id, fields) nodes_d[node_id] = fields_d return nodes_d def set_nodes_field(self, key, values_dict): for node_id, value in values_dict.items(): self.graph.nodes[node_id][key] = value def node_map(self, func, *args, map_func, idxs=False, node_sel=None): pass def node_fields_map(self, func, fields, *args, map_func=map, idxs=False, node_sel=None): pass def compute_macrostate_attr(self, func, fields, *args, map_func=map, node_sel=None, idxs=False, attr_name=None, return_results=True): pass def microstate_weights(self): """Calculates and returns the sums of the weights of all the nodes as a dictionary mapping node_id -> frame weights""" node_weights = {} for node_id in self.graph.nodes: # get the trace of the frames in the node node_trace = self.node_assignments(node_id) # use the node_trace to get the weights from the 
HDF5 trace_weights = self.wepy_h5.get_trace_fields(node_trace, ['weights'])['weights'] node_weights[node_id] = trace_weights return node_weights def macrostate_weights(self): macrostate_weights = {} microstate_weights = self.microstate_weights() for node_id, weights in microstate_weights.items(): macrostate_weights[node_id] = float(sum(weights)[0]) return macrostate_weights def set_macrostate_weights(self): self.set_nodes_field('Weight', self.macrostate_weights()) def state_to_mdtraj(self, node_id, alt_rep=None): return self.wepy_h5.trace_to_mdtraj(self.node_assignments(node_id), alt_rep=alt_rep) def write_gexf(self, filepath): # to do this we need to get rid of the assignments in the # nodes though since this is not really supported or good to # store in a gexf file which is more for visualization as an # XML format, so we copy and modify then write the copy gexf_graph = deepcopy(self._graph) for node in gexf_graph: del gexf_graph.nodes[node][self.ASSIGNMENTS] nx.write_gexf(gexf_graph, filepath) PK!~__wepy/analysis/parents.pyfrom copy import copy import numpy as np DISCONTINUITY_VALUE = -1 def parent_panel(decision_class, resampling_panel): parent_panel = [] for cycle_idx, cycle in enumerate(resampling_panel): # each stage in the resampling for that cycle # make a stage parent table parent_table = [] # now iterate through the rest of the stages for step in cycle: # get the parents idxs for the children of this step step_parents = decision_class.parents(step) # for the full stage table save all the intermediate parents parent_table.append(step_parents) # for the full parent panel parent_panel.append(parent_table) return parent_panel def net_parent_table(parent_panel): net_parent_table = [] # each cycle for cycle_idx, step_parent_table in enumerate(parent_panel): # for the net table we only want the end results, # we start at the last cycle and look at its parent step_net_parents = [] n_steps = len(step_parent_table) for walker_idx, parent_idx in enumerate(step_parent_table[-1]): # initialize the root_parent_idx which will be updated root_parent_idx = parent_idx # if no resampling skip the loop and just return the idx if n_steps > 0: # go back through the steps getting the parent at each step for prev_step_idx in range(n_steps): prev_step_parents = step_parent_table[-(prev_step_idx+1)] root_parent_idx = prev_step_parents[root_parent_idx] # when this is done we should have the index of the root parent, # save this as the net parent index step_net_parents.append(root_parent_idx) # for this step save the net parents net_parent_table.append(step_net_parents) return net_parent_table def parent_table_discontinuities(boundary_condition_class, parent_table, warping_records): """Given a parent table and warping records returns a new parent table with the discontinuous warping events for parents set to -1""" # Make a copy of the parent table new_parent_table = copy(parent_table) # Find the number of walkers and cycles n_walker = np.shape(parent_table)[1] for rec_idx, warp_record in enumerate(warping_records): cycle_idx = warp_record[0] parent_idx = warp_record[1] # Check to see if any walkers in the current step # originated from this warped walker for walker_idx in range(n_walker): # if it's parent is the walker in this warping event # we also need to check to see if that warping event # was a discontinuous warping event if parent_table[cycle_idx][walker_idx] == parent_idx: # just check by using the method from the boundary # condition class used if 
boundary_condition_class.warping_discontinuity(warp_record): # set the element in the parent table to the # discontinuity value if it is new_parent_table[cycle_idx][walker_idx] = DISCONTINUITY_VALUE return new_parent_table def ancestors(parent_table, cycle_idx, walker_idx, ancestor_cycle=0): """Given a parent table, step_idx, and walker idx returns the ancestor at a given cycle of the walker. Input: parent: table (n_cycles x n_walkers numpy array): It describes how the walkers merged and cloned during the WExplore simulation . cycle_idx: walker_idx: ancestor_cycle: Output: ancestors: A list of 2x1 tuples indicating the walker and cycle parents """ lineage = [(walker_idx, cycle_idx)] previous_walker = walker_idx for curr_cycle_idx in range(cycle_idx-1, ancestor_cycle-1, -1): previous_walker = parent_table[curr_cycle_idx][previous_walker] # check for discontinuities, e.g. warping events if previous_walker == -1: # there are no more continuous ancestors for this # walker so we cannot return ancestors back to the # requested cycle just return the ancestors to this # point break previous_point = (previous_walker, curr_cycle_idx) lineage.insert(0, previous_point) return lineage def sliding_window(parent_table, window_length): """Returns traces (lists of frames across a run) on a sliding window on the branching structure of a run of a WepyHDF5 file. There is no particular order guaranteed. """ # assert parent_table.dtype == np.int, \ # "parent table values must be integers, not {}".format(parent_table.dtype) assert window_length > 1, "window length must be greater than one" windows = [] # we make a range iterator which goes from the last cycle to the # cycle which would be the end of the first possible sliding window for cycle_idx in range(len(parent_table)-1, window_length-2, -1): # then iterate for each walker at this cycle for walker_idx in range(len(parent_table[0])): # then get the ancestors according to the sliding window window = ancestors(parent_table, cycle_idx, walker_idx, ancestor_cycle=cycle_idx-(window_length-1)) # if the window is too short because the lineage has a # discontinuity in it skip to the next window if len(window) < window_length: continue windows.append(window) return windows PK!?o<wepy/analysis/transitions.pyimport itertools as it from collections import defaultdict import numpy as np def transition_counts(assignments, transitions): """Make a dictionary of transition counts. assignments: a list of [N_run, [N_traj x N_cycle]] arrays of ints where N_runs is the number of runs, N_traj is the number of trajectories, and N_cycle is the number of cycles transitions: list of traces (a trace is a list of tuples specifying the run, trajectory, and frame). 
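    For example, with illustrative data:

        assignments = [[[0, 1, 1]]]                   # one run, one trajectory, three cycles
        transitions = [[(0, 0, 0), (0, 0, 2)]]        # a single start -> end trace
        transition_counts(assignments, transitions)   # -> {(0, 1): 1}

    Only the first and last frames of each trace are used, so the result maps
    (start_label, end_label) pairs of assignment labels to the number of times
    that transition was observed.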
""" # for each transition (start, end) (don't permute) and count them # up in a dictionary countsmat_d = defaultdict(int) for transition in transitions: start = transition[0] end = transition[-1] # get the assignments for the transition start_assignment = assignments[start[0]][start[1]][start[2]] end_assignment = assignments[end[0]][end[1]][end[2]] countsmat_d[(start_assignment, end_assignment)] += 1 return countsmat_d def counts_d_to_matrix(counts_d): # get the number of unique nodes in the counts_d max_assignment = max(it.chain(*counts_d.keys())) countsmat = np.zeros((max_assignment+1, max_assignment+1)) for transition, n_trans in counts_d.items(): countsmat[transition] = n_trans return countsmat def normalize_counts(transition_counts_matrix): return np.divide(transition_counts_matrix, transition_counts_matrix.sum(axis=0)) def transition_counts_matrix(assignments, transitions): """Make a transition count matrix for a single run. assignments: a list of N_run, [N_traj x N_cycle] arrays of ints where N_runs is the number of runs, N_traj is the number of trajectories, and N_cycle is the number of cycles transitions: list of traces (a trace is a list of tuples specifying the run, trajectory, and frame). """ # count them and return as a dictionary countsmat_d = transition_counts(assignments, transitions) # convert to matrix countsmat = counts_d_to_matrix(countsmat_d) return countsmat def transition_probability_matrix(assignments, transitions): """ This determines a transition matrix for a variable lag time. Inputs: assignments : (numpy array [n_traj x n_timestep]): This is an array that indicates the cluster number for each traj at each timestep. sliding_window(iterable) : list of transitions. Transitions are a tuple of the start and end frame for a transition. Start and end frames are given by (traj_idx, frame_idx). Outputs: trans_prob_mat (numpy array [n_cluster x n_cluster]): A transition probability matrix. """ # get the counts trans_counts_mat = transition_counts_matrix(assignments, transitions) # normalize to get the transition probabilities trans_prob_mat = normalize_counts(trans_counts_mat) return trans_prob_mat def run_transition_counts_matrix(wepy_hdf5, run_idx, assignment_key, transitions): """Make a transition counts matrix from a WepyHDF5 run for a particular assignment given a set of transitions. """ total_counts_d = defaultdict(int) max_assignment = 0 for transition in transitions: start = transition[0] end = transition[-1] # Gets cluster pair from the hdf5 file assignments = wepy_hdf5.get_run_trace_fields(run_idx, [start, end], [assignment_key])[assignment_key] # Add a count to the cluster pair in the dictionary total_counts_d[(assignments[0], assignments[1])] += 1 # If the assignment index is higher than previously seen # assignments, update the max_assignment max_assg = max(assignments) if max_assg > max_assignment: max_assignment = max_assg # make a matrix of the counts counts_matrix = np.zeros((max_assignment+1, max_assignment+1)) for transition, n_trans in total_counts_d.items(): counts_matrix[transition] = n_trans return counts_matrix def run_transition_probability_matrix(wepy_hdf5, run_idx, assignment_key, transitions): """Make a transition probability matrix from a WepyHDF5 run for a particular assignment given a set of transitions. 
""" # get the counts for the run counts_mat = run_transition_counts_matrix(wepy_hdf5, run_idx, assignment_key, transitions) # normalize to get the probabilities trans_prob_matrix = normalize_counts(counts_mat) return trans_prob_matrix PK!$wepy/boundary_conditions/__init__.pyPK!I $wepy/boundary_conditions/boundary.pyimport sys import logging import numpy as np from wepy.walker import Walker class BoundaryConditions(object): # records of boundary condition changes (sporadic) BC_FIELDS = () BC_SHAPES = () BC_DTYPES = () BC_RECORD_FIELDS = () # warping (sporadic) WARPING_FIELDS = () WARPING_SHAPES = () WARPING_DTYPES = () WARPING_RECORD_FIELDS = () # progress towards the boundary conditions (continual) PROGRESS_FIELDS = () PROGRESS_SHAPES = () PROGRESS_DTYPES = () PROGRESS_RECORD_FIELDS = () def __init__(self, **kwargs): pass def bc_field_names(self): return self.BC_FIELDS def bc_field_shapes(self): return self.BC_SHAPES def bc_field_dtypes(self): return self.BC_DTYPES def bc_fields(self): return list(zip(self.bc_field_names(), self.bc_field_shapes(), self.bc_field_dtypes())) def bc_record_field_names(self): return self.BC_RECORD_FIELDS def warping_field_names(self): return self.WARPING_FIELDS def warping_field_shapes(self): return self.WARPING_SHAPES def warping_field_dtypes(self): return self.WARPING_DTYPES def warping_fields(self): return list(zip(self.warping_field_names(), self.warping_field_shapes(), self.warping_field_dtypes())) def warping_record_field_names(self): return self.WARPING_RECORD_FIELDS def progress_field_names(self): return self.PROGRESS_FIELDS def progress_field_shapes(self): return self.PROGRESS_SHAPES def progress_field_dtypes(self): return self.PROGRESS_DTYPES def progress_fields(self): return list(zip(self.progress_field_names(), self.progress_field_shapes(), self.progress_field_dtypes())) def progress_record_field_names(self): return self.PROGRESS_RECORD_FIELDS def progress(self, walker): """ Checks if a walker is in a boundary and returns which boundary it is in""" raise NotImplementedError def warp_walkers(self, walkers): """Checks walkers for membership in boundaries and processes them according to the rules of the boundary.""" raise NotImplementedError class NoBC(BoundaryConditions): def check_boundaries(self, walker): return False, {} def warp_walkers(self, walkers): # in order the walkers after applying warps: # warping, bc, progress warp_data = {} bc_data = {} progress_data = {} return walkers, warp_data, bc_data, progress_data PK! 
W 66%wepy/boundary_conditions/rebinding.pyimport sys import itertools as it from collections import defaultdict from random import random import logging import numpy as np import numpy.linalg as la from numpy.random import choice import mdtraj as mdj from wepy.boundary_conditions.boundary import BoundaryConditions from wepy.resampling.distances.openmm import OpenMMRebindingDistance class RebindingBC(BoundaryConditions): WARP_INSTRUCT_DTYPE = np.dtype([('target', int)]) WARP_AUX_DTYPES = {'cycle' : np.int, 'passage_time' : np.float, 'warped_walker_weight' : np.float} WARP_AUX_SHAPES = {'cycle' : (1,), 'passage_time' : (1,), 'warped_walker_weight' : (1,)} def __init__(self, initial_states=None, initial_weights=None, cutoff_distance=0.2, topology=None, ligand_idxs=None, binding_site_idxs=None, comp_xyz=None, alternative_maps=None): # test input assert initial_states is not None, "Must give a set of initial states" assert topology is not None, "Must give a reference topology" assert comp_xyz is not None, "Must give coordinates for bound state" assert ligand_idxs is not None assert binding_site_idxs is not None assert type(cutoff_distance) is float self.initial_states = initial_states if initial_weights is None: self.initial_weights = np.array([1] * len(initial_states)) else: self.initial_weights = initial_weights self.cutoff_distance = cutoff_distance self.topology = topology self.native_distance = OpenMMRebindingDistance(topology=topology, ligand_idxs=ligand_idxs, binding_site_idxs=binding_site_idxs, alt_maps=alternative_maps, comp_xyz=comp_xyz) def check_boundaries(self, nat_rmsd): # test to see if the ligand is re-bound rebound = False if nat_rmsd <= self.cutoff_distance: rebound = True boundary_data = {'nat_rmsd' : nat_rmsd} return rebound, boundary_data def warp(self, walker, cycle): # choose a state randomly from the set of initial states warped_state = choice(self.initial_states, 1, p=self.initial_weights/np.sum(self.initial_weights))[0] # set the initial state into a new walker object with the same weight warped_walker = type(walker)(state=warped_state, weight=walker.weight) # thus there is only one record warp_record = (0,) # collect the passage time # time is returned as an array because it is a feature of the # walker, and domain specific. I.e. domain specific values are # of type `array` while weights will always be floats in all # applications. 
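        # build the auxiliary warp data; each value is stored as an array
        # (see WARP_AUX_SHAPES declared at the top of the class for the
        # expected shapes)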
time = walker.time_value() warp_data = {'cycle' : np.array([cycle]), 'passage_time' : time, 'warped_walker_weight' : np.array([walker.weight])} # make the warp data mapping return warped_walker, warp_record, warp_data def warp_walkers(self, walkers, cycle): new_walkers = [] warped_walkers_records = [] cycle_bc_records = [] # boundary data is collected for each walker every cycle cycle_boundary_data = defaultdict(list) # warp data is collected each time a warp occurs cycle_warp_data = defaultdict(list) native_rmsds = self.native_distance.get_rmsd_native(walkers) for walker_idx, walker in enumerate(walkers): # check if it is unbound, also gives the minimum distance # between guest and host rebound, boundary_data = self.check_boundaries(native_rmsds[walker_idx]) # add boundary data for this walker for key, value in boundary_data.items(): cycle_boundary_data[key].append(value) # if the walker is unbound we need to warp it if rebound: # warp the walker warped_walker, warp_record, warp_data = self.warp(walker,cycle) # save warped_walker in the list of new walkers to return new_walkers.append(warped_walker) # save the record of the walker warped_walkers_records.append( (walker_idx, warp_record) ) # save warp data for key, value in warp_data.items(): cycle_warp_data[key].append(value) logging.info('REBINDING observed at {}'.format( warp_data['passage_time'])) logging.info('Warped Walker Weight = {}'.format( warp_data['warped_walker_weight'])) # no warping so just return the original walker else: new_walkers.append(walker) # convert aux datas to np.arrays for key, value in cycle_warp_data.items(): cycle_warp_data[key] = np.array(value) for key, value in cycle_boundary_data.items(): cycle_boundary_data[key] = np.array(value) return new_walkers, warped_walkers_records, cycle_warp_data, \ cycle_bc_records, cycle_boundary_data PK!~  %wepy/boundary_conditions/unbinding.pyimport sys import itertools as it from collections import defaultdict from copy import copy import logging import numpy as np import numpy.linalg as la import mdtraj as mdj from wepy.boundary_conditions.boundary import BoundaryConditions class UnbindingBC(BoundaryConditions): # records of boundary condition changes (sporadic) BC_FIELDS = ('boundary_distance', ) BC_SHAPES = ((1,), ) BC_DTYPES = (np.float, ) BC_RECORD_FIELDS = ('boundary_distance', ) # warping (sporadic) WARPING_FIELDS = ('walker_idx', 'target_idx', 'weight') WARPING_SHAPES = ((1,), (1,), (1,)) WARPING_DTYPES = (np.int, np.int, np.float) WARPING_RECORD_FIELDS = ('walker_idx', 'target_idx', 'weight') # progress towards the boundary conditions (continual) PROGRESS_FIELDS = ('min_distances',) PROGRESS_SHAPES = (Ellipsis,) PROGRESS_DTYPES = (np.float,) PROGRESS_RECORD_FIELDS = ('min_distances', ) # for boundary conditions that warp things around certain targets # mayb not introduce discontiuities, these target idxs do DISCONTINUITY_TARGET_IDXS = (0,) def __init__(self, initial_state=None, cutoff_distance=1.0, topology=None, ligand_idxs=None, receptor_idxs=None): super().__init__() # test input assert initial_state is not None, "Must give an initial state" assert topology is not None, "Must give a reference topology" assert ligand_idxs is not None assert receptor_idxs is not None assert type(cutoff_distance) is float self.initial_state = initial_state self.cutoff_distance = cutoff_distance self.topology = topology self.ligand_idxs = ligand_idxs self.receptor_idxs = receptor_idxs def _calc_angle(self, v1, v2): return np.degrees(np.arccos(np.dot(v1, v2)/(la.norm(v1) * 
la.norm(v2)))) def _calc_length(self, v): return la.norm(v) def _calc_min_distance(self, walker): # convert box_vectors to angles and lengths for mdtraj # calc box length cell_lengths = np.array([[self._calc_length(v) for v in walker.state['box_vectors']]]) # TODO order of cell angles # calc angles cell_angles = np.array([[self._calc_angle(walker.state['box_vectors'][i], walker.state['box_vectors'][j]) for i, j in [(0,1), (1,2), (2,0)]]]) # make a traj out of it so we can calculate distances through # the periodic boundary conditions walker_traj = mdj.Trajectory(walker.state['positions'], topology=self.topology, unitcell_lengths=cell_lengths, unitcell_angles=cell_angles) # calculate the distances through periodic boundary conditions # and get hte minimum distance min_distance = np.min(mdj.compute_distances(walker_traj, it.product(self.ligand_idxs, self.receptor_idxs))) return min_distance def progress(self, walker): min_distance = self._calc_min_distance(walker) # test to see if the ligand is unbound unbound = False if min_distance >= self.cutoff_distance: unbound = True progress_data = {'min_distances' : min_distance} return unbound, progress_data def warp(self, walker): # we always start at the initial state warped_state = self.initial_state # set the initial state into a new walker object with the same # weight warped_walker = type(walker)(state=warped_state, weight=walker.weight) # thus there is only value for a record target_idx = 0 # the data for the warp warp_data = {'target_idx' : np.array([target_idx]), 'weight' : np.array([walker.weight])} return warped_walker, warp_data def update_bc(self, new_walkers, warp_data, progress_data, cycle): # TODO just for testing if this works. only report a record on # the first cycle which gives the distance at which walkers # are warped if cycle == 0: return [{'boundary_distance' : np.array([self.cutoff_distance]),},] else: return [] def warp_walkers(self, walkers, cycle): new_walkers = [] # sporadic, zero or many records per call warp_data = [] bc_data = [] # continual, one record per call progress_data = defaultdict(list) for walker_idx, walker in enumerate(walkers): # check if it is unbound, also gives the minimum distance # between guest and host unbound, walker_progress_data = self.progress(walker) # add that to the progress data record for key, value in walker_progress_data.items(): progress_data[key].append(value) # if the walker is unbound we need to warp it if unbound: # warp the walker warped_walker, walker_warp_data = self.warp(walker) # add the walker idx to the walker warp record walker_warp_data['walker_idx'] = np.array([walker_idx]) # save warped_walker in the list of new walkers to return new_walkers.append(warped_walker) # save the instruction record of the walker warp_data.append(walker_warp_data) logging.info('EXIT POINT observed at {}'.format(cycle)) logging.info('Warped Walker Weight = {}'.format( walker_warp_data['weight'])) # no warping so just return the original walker else: new_walkers.append(walker) # consolidate the progress data to an array of a single # feature vectors for the cycle for key, value in progress_data.items(): progress_data[key] = value # if the boundary conditions need to be updated given the # cycle and state from warping perform that now and return any # record data for that bc_data = self.update_bc(new_walkers, warp_data, progress_data, cycle) return new_walkers, warp_data, bc_data, progress_data @classmethod def warping_discontinuity(cls, warping_record): """Given a warping record returns either 
True for a discontiuity occured or False if a discontinuity did not occur.""" # the target_idxs are one of the discontinuous targets if warping_record[2] in cls.DISCONTINUITY_TARGET_IDXS: return True else: return False PK!1JJ wepy/hdf5.pyimport os.path as osp from collections import Sequence, namedtuple import itertools as it import json from warnings import warn from copy import copy import logging import numpy as np import h5py import networkx as nx from wepy.util.mdtraj import mdtraj_to_json_topology, json_to_mdtraj_topology from wepy.util.util import traj_box_vectors_to_lengths_angles, json_top_atom_count # optional dependencies try: import mdtraj as mdj except ModuleNotFoundError: warn("mdtraj is not installed and that functionality will not work", RuntimeWarning) try: import pandas as pd except ModuleNotFoundError: warn("pandas is not installed and that functionality will not work", RuntimeWarning) ## Constants for the main trajectories data group # Constants N_DIMS = 3 TRAJECTORIES = 'trajectories' # strings for trajectory fields POSITIONS = 'positions' BOX_VECTORS = 'box_vectors' VELOCITIES = 'velocities' FORCES = 'forces' TIME = 'time' KINETIC_ENERGY = 'kinetic_energy' POTENTIAL_ENERGY = 'potential_energy' BOX_VOLUME = 'box_volume' OBSERVABLES = 'observables' PARAMETERS = 'parameters' PARAMETER_DERIVATIVES = 'parameter_derivatives' WEIGHTS = 'weights' # lists of keys etc. TRAJ_DATA_FIELDS = (POSITIONS, TIME, BOX_VECTORS, VELOCITIES, FORCES, KINETIC_ENERGY, POTENTIAL_ENERGY, BOX_VOLUME, PARAMETERS, PARAMETER_DERIVATIVES, OBSERVABLES) # defaults for the rank (length of shape vector) for certain # unchanging data fields. This is the rank of the feawture not the # array that will acutally be saved int he hdf5. That will always be # one more than the rank of the feature. FIELD_FEATURE_RANKS = ((POSITIONS, 2), (TIME, 1), (BOX_VECTORS, 2), (VELOCITIES, 2), (FORCES, 2), (BOX_VOLUME, 1), (KINETIC_ENERGY, 1), (POTENTIAL_ENERGY, 1), ) # defaults for the shapes for those fields they can be given to. FIELD_FEATURE_SHAPES = ((TIME, (1,)), (BOX_VECTORS, (3,3)), (BOX_VOLUME, (1,)), (KINETIC_ENERGY, (1,)), (POTENTIAL_ENERGY, (1,)), ) WEIGHT_SHAPE = (1,) FIELD_FEATURE_DTYPES = ((POSITIONS, np.float), (VELOCITIES, np.float), (FORCES, np.float), (TIME, np.float), (BOX_VECTORS, np.float), (BOX_VOLUME, np.float), (KINETIC_ENERGY, np.float), (POTENTIAL_ENERGY, np.float), ) # Positions (and thus velocities and forces) are determined by the # N_DIMS (which can be customized) and more importantly the number of # particles which is always different. All the others are always wacky # and different. POSITIONS_LIKE_FIELDS = (VELOCITIES, FORCES) # some fields have more than one dataset associated with them COMPOUND_DATA_FIELDS = (PARAMETERS, PARAMETER_DERIVATIVES, OBSERVABLES) COMPOUND_UNIT_FIELDS = (PARAMETERS, PARAMETER_DERIVATIVES, OBSERVABLES) # some fields can have sparse data, non-sparse data must be all the # same shape and larger than sparse arrays. Sparse arrays have an # associated index with them aligning them to the non-sparse datasets SPARSE_DATA_FIELDS = (VELOCITIES, FORCES, KINETIC_ENERGY, POTENTIAL_ENERGY, BOX_VOLUME, PARAMETERS, PARAMETER_DERIVATIVES, OBSERVABLES) ## Run data records # the groups of run records RESAMPLING = 'resampling' RESAMPLER = 'resampler' WARPING = 'warping' PROGRESS = 'progress' BC = 'boundary_conditions' CYCLE_IDXS = '_cycle_idxs' # records can be sporadic or continual. Continual records are # generated every cycle and are saved every cycle and are for all # walkers. 
Sporadic records are generated conditional on specific # events taking place and thus may or may not be produced each # cycle. There also is not a single record for each (cycle, step) like # there would be for continual ones because they can occur for single # walkers, boundary conditions, or resamplers. SPORADIC_RECORDS = (RESAMPLER, WARPING, RESAMPLING, BC) ## Dataset Compliances # a file which has different levels of keys can be used for # different things, so we define these collections of keys, # and flags to keep track of which ones this dataset # satisfies, ds here stands for "dataset" COMPLIANCE_TAGS = ['COORDS', 'TRAJ', 'RESTART'] # the minimal requirement (and need for this class) is to associate # a collection of coordinates to some molecular structure (topology) COMPLIANCE_REQUIREMENTS = (('COORDS', (POSITIONS,)), ('TRAJ', (POSITIONS, TIME, BOX_VECTORS)), ('RESTART', (POSITIONS, TIME, BOX_VECTORS, VELOCITIES)), ) class WepyHDF5(object): def __init__(self, filename, topology=None, mode='x', units=None, sparse_fields=None, feature_shapes=None, feature_dtypes=None, n_dims=None, alt_reps=None, main_rep_idxs=None, expert_mode=False ): """Initialize a new Wepy HDF5 file. This is a file that organizes wepy.TrajHDF5 dataset subsets by simulations by runs and includes resampling records for recovering walker histories. mode: r Readonly, file must exist r+ Read/write, file must exist w Create file, truncate if exists x or w- Create file, fail if exists a Read/write if exists, create otherwise """ self._filename = filename if expert_mode is True: self._h5 = None self._wepy_mode = None self._h5py_mode = None self.closed = None # terminate the constructor here return None assert mode in ['r', 'r+', 'w', 'w-', 'x', 'a', 'c', 'c-'], \ "mode must be either 'r', 'r+', 'w', 'x', 'w-', 'a', 'c', or 'c-'" # the top level mode enforced by wepy.hdf5 self._wepy_mode = mode # the lower level h5py mode. THis was originally different to # accomodate different modes at teh wepy level for # concatenation. I will leave these separate because this is # used elsewhere and could be a feature in the future. self._h5py_mode = mode # Temporary metadata: used to initialize the object but not # used after that self._topology = topology self._units = units self._n_dims = n_dims self._n_coords = None # set hidden feature shapes and dtype, which are only # referenced if needed when trajectories are created. 
These # will be saved in the settings section in the actual HDF5 # file self._field_feature_shapes_kwarg = feature_shapes self._field_feature_dtypes_kwarg = feature_dtypes self._field_feature_dtypes = None self._field_feature_shapes = None # save the sparse fields as a private variable for use in the # create constructor if sparse_fields is None: self._sparse_fields = [] else: self._sparse_fields = sparse_fields # if we specify an atom subset of the main POSITIONS field # we must save them self._main_rep_idxs = main_rep_idxs # a dictionary specifying other alt_reps to be saved if alt_reps is not None: self._alt_reps = alt_reps # all alt_reps are sparse alt_rep_keys = ['alt_reps/{}'.format(key) for key in self._alt_reps.keys()] self._sparse_fields.extend(alt_rep_keys) else: self._alt_reps = {} # open the file and then run the different constructors based # on the mode with h5py.File(filename, mode=self._h5py_mode) as h5: self._h5 = h5 # create file mode: 'w' will create a new file or overwrite, # 'w-' and 'x' will not overwrite but will create a new file if self._wepy_mode in ['w', 'w-', 'x']: self._create_init() # read/write mode: in this mode we do not completely overwrite # the old file and start again but rather write over top of # values if requested elif self._wepy_mode in ['r+']: self._read_write_init() # add mode: read/write create if doesn't exist elif self._wepy_mode in ['a']: if osp.exists(self._filename): self._read_write_init() else: self._create_init() # read only mode elif self._wepy_mode == 'r': # if any data was given, warn the user if any([kwarg is not None for kwarg in [topology, units, sparse_fields, feature_shapes, feature_dtypes, n_dims, alt_reps, main_rep_idxs]]): warn("Data was given but opening in read-only mode", RuntimeWarning) # then run the initialization process self._read_init() # flush the buffers self._h5.flush() # set the h5py mode to the value in the actual h5py.File # object after creation self._h5py_mode = self._h5.mode # get rid of the temporary variables del self._topology del self._units del self._n_dims del self._n_coords del self._field_feature_shapes_kwarg del self._field_feature_dtypes_kwarg del self._field_feature_shapes del self._field_feature_dtypes del self._sparse_fields del self._main_rep_idxs del self._alt_reps # variable to reflect if it is closed or not, should be closed # after initialization self.closed = True # end of the constructor return None # context manager methods def __enter__(self): self._h5 = h5py.File(self._filename) self.closed = False return self def __exit__(self, exc_type, exc_value, exc_tb): self._h5.flush() self.close() # custom deepcopy to avoid copying the actual HDF5 object # constructors def _create_init(self): """Completely overwrite the data in the file. Reinitialize the values and set with the new ones if given.""" assert self._topology is not None, \ "Topology must be given for a creation constructor" # initialize the runs group runs_grp = self._h5.create_group('runs') # initialize the settings group settings_grp = self._h5.create_group('_settings') # create the topology dataset self._h5.create_dataset('topology', data=self._topology) # sparse fields if self._sparse_fields is not None: # make a dataset for the sparse fields allowed. this requires # a 'special' datatype for variable length strings. This is # supported by HDF5 but not numpy. 
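            # note: h5py.special_dtype(vlen=str) is the older h5py spelling
            # for an HDF5 variable-length string dtype; newer h5py releases
            # also expose this as h5py.string_dtype()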
vlen_str_dt = h5py.special_dtype(vlen=str) # create the dataset with empty values for the length of the # sparse fields given sparse_fields_ds = settings_grp.create_dataset('sparse_fields', (len(self._sparse_fields),), dtype=vlen_str_dt, maxshape=(None,)) # set the flags for i, sparse_field in enumerate(self._sparse_fields): sparse_fields_ds[i] = sparse_field # field feature shapes and dtypes # initialize to the defaults, this gives values to # self._n_coords, and self.field_feature_dtypes, and # self.field_feature_shapes self._set_default_init_field_attributes(n_dims=self._n_dims) # save the number of dimensions and number of atoms in settings settings_grp.create_dataset('n_dims', data=np.array(self._n_dims)) settings_grp.create_dataset('n_atoms', data=np.array(self._n_coords)) # the main rep atom idxs settings_grp.create_dataset('main_rep_idxs', data=self._main_rep_idxs, dtype=np.int) # alt_reps settings alt_reps_idxs_grp = settings_grp.create_group("alt_reps_idxs") for alt_rep_name, idxs in self._alt_reps.items(): alt_reps_idxs_grp.create_dataset(alt_rep_name, data=idxs, dtype=np.int) # if both feature shapes and dtypes were specified overwrite # (or initialize if not set by defaults) the defaults if (self._field_feature_shapes_kwarg is not None) and\ (self._field_feature_dtypes_kwarg is not None): self._field_feature_shapes.update(self._field_feature_shapes_kwarg) self._field_feature_dtypes.update(self._field_feature_dtypes_kwarg) # any sparse field with unspecified shape and dtype must be # set to None so that it will be set at runtime for sparse_field in self.sparse_fields: if (not sparse_field in self._field_feature_shapes) or \ (not sparse_field in self._field_feature_dtypes): self._field_feature_shapes[sparse_field] = None self._field_feature_dtypes[sparse_field] = None # save the field feature shapes and dtypes in the settings group shapes_grp = settings_grp.create_group('field_feature_shapes') for field_path, field_shape in self._field_feature_shapes.items(): if field_shape is None: # set it as a dimensionless array of NaN field_shape = np.array(np.nan) shapes_grp.create_dataset(field_path, data=field_shape) dtypes_grp = settings_grp.create_group('field_feature_dtypes') for field_path, field_dtype in self._field_feature_dtypes.items(): if field_dtype is None: dt_str = 'None' else: # make a json string of the datatype that can be read # in again, we call np.dtype again because there is no # np.float.descr attribute dt_str = json.dumps(np.dtype(field_dtype).descr) dtypes_grp.create_dataset(field_path, data=dt_str) # initialize the units group unit_grp = self._h5.create_group('units') # if units were not given set them all to None if self._units is None: self._units = {} for field_path in self._field_feature_shapes.keys(): self._units[field_path] = None # set the units for field_path, unit_value in self._units.items(): # ignore the field if not given if unit_value is None: continue unit_path = '/units/{}'.format(field_path) unit_grp.create_dataset(unit_path, data=unit_value) # create the group for the run data records records_grp = settings_grp.create_group('record_fields') # create a dataset for the continuation run tuples # (continuation_run, base_run), where the first element # of the new run that is continuing the run in the second # position self._init_continuations() def _read_write_init(self): """Write over values if given but do not reinitialize any old ones. 
""" self._read_init() def _add_init(self): """Create the dataset if it doesn't exist and put it in r+ mode, otherwise, just open in r+ mode.""" if not any(self._exist_flags): self._create_init() else: self._read_write_init() def _read_init(self): """Read only initialization currently has nothing to do.""" pass def _get_field_path_grp(self, run_idx, traj_idx, field_path): """Given a field path for the trajectory returns the group the field's dataset goes in and the key for the field name in that group. The field path for a simple field is just the name of the field and for a compound field it is the compound field group name with the subfield separated by a '/' like 'observables/observable1' where 'observables' is the compound field group and 'observable1' is the subfield name. """ # check if it is compound if '/' in field_path: # split it grp_name, field_name = field_path.split('/') # get the hdf5 group grp = self.h5['runs/{}/trajectories/{}/{}'.format(run_idx, traj_idx, grp_name)] # its simple so just return the root group and the original path else: grp = self.h5 field_name = field_path return grp, field_name def _set_default_init_field_attributes(self, n_dims=None): """Sets the feature_shapes and feature_dtypes to be the default for this module. These will be used to initialize field datasets when no given during construction (i.e. for sparse values)""" # we use the module defaults for the datasets to initialize them field_feature_shapes = dict(FIELD_FEATURE_SHAPES) field_feature_dtypes = dict(FIELD_FEATURE_DTYPES) # get the number of coordinates of positions. If there is a # main_reps then we have to set the number of atoms to that, # if not we count the number of atoms in the topology if self._main_rep_idxs is None: self._n_coords = json_top_atom_count(self.topology) self._main_rep_idxs = list(range(self._n_coords)) else: self._n_coords = len(self._main_rep_idxs) # get the number of dimensions as a default if n_dims is None: self._n_dims = N_DIMS # feature shapes for positions and positions-like fields are # not known at the module level due to different number of # coordinates (number of atoms) and number of dimensions # (default 3 spatial). We set them now that we know this # information. # add the postitions shape field_feature_shapes[POSITIONS] = (self._n_coords, self._n_dims) # add the positions-like field shapes (velocities and forces) as the same for poslike_field in POSITIONS_LIKE_FIELDS: field_feature_shapes[poslike_field] = (self._n_coords, self._n_dims) # set the attributes self._field_feature_shapes = field_feature_shapes self._field_feature_dtypes = field_feature_dtypes @property def filename(self): return self._filename def open(self): if self.closed: self._h5 = h5py.File(self._filename, self._h5py_mode) self.closed = False else: raise IOError("This file is already open") def close(self): if not self.closed: self._h5.close() self.closed = True # TODO is this right? shouldn't we actually delete the data then close def __del__(self): self.close() def clone(self, path, mode='x'): """Clones this WepyHDF5 file without any of the actual runs and run data. This includes the topology, units, sparse_fields, feature shapes and dtypes, alt_reps, and main representation information. This method will flush the buffers for this file. Does not preserve metadata pertaining to inter-run relationships like continuations. 
""" assert mode in ['w', 'w-', 'x'], "must be opened in a file creation mode" # we manually construct an HDF5 and copy the groups over new_h5 = h5py.File(path, mode=mode) new_h5.create_group('runs') # flush the datasets buffers self.h5.flush() new_h5.flush() # copy the existing datasets to the new one h5py.h5o.copy(self._h5.id, b'topology', new_h5.id, b'topology') h5py.h5o.copy(self._h5.id, b'units', new_h5.id, b'units') h5py.h5o.copy(self._h5.id, b'_settings', new_h5.id, b'_settings') # for the settings we need to get rid of the data for interun # relationships like the continuations, so we reinitialize the # continuations self._init_continuations() # now make a WepyHDF5 object in "expert_mode" which means it # is just empy and we construct it manually, "surgically" as I # like to call it new_wepy_h5 = WepyHDF5(path, expert_mode=True) # perform the surgery: # attach the h5py.File new_wepy_h5._h5 = new_h5 # set the wepy mode to read-write since the creation flags # were already used in construction of the h5py.File object new_wepy_h5._wepy_mode = 'r+' new_wepy_h5._h5py_mode = 'r+' # close the h5py.File and set the attribute to closed new_wepy_h5._h5.close() new_wepy_h5.closed = True # return the runless WepyHDF5 object return new_wepy_h5 @property def mode(self): return self._wepy_mode @property def h5_mode(self): return self._h5.mode @property def h5(self): return self._h5 @property def n_trajs(self): return len(list(self.run_traj_idx_tuples())) @property def settings_grp(self): settings_grp = self.h5['_settings'] return settings_grp @property def n_atoms(self): return self.h5['_settings/n_atoms'][()] @property def n_dims(self): return self.h5['_settings/n_dims'][()] @property def topology(self): """The topology for the full simulated system. May not be the main representation in the POSITIONS field; for that use the `topology` method. """ return self._h5['topology'][()] def get_mdtraj_topology(self, alt_rep=POSITIONS): """Get an MDTraj `Topology` object for a subset of the atoms in the positions of a particular representation. By default gives the topology for the main 'positions' field (when alt_rep 'positions'). To get the full topology the file was initialized with set `alt_rep` to `None`. Topologies for alternative representations (subfields of 'alt_reps') can be obtained by passing in the key for that alt_rep. For example, 'all_atoms' for the field in alt_reps called 'all_atoms'. """ full_mdj_top = json_to_mdtraj_topology(self.topology) if alt_rep is None: return full_mdj_top elif alt_rep == POSITIONS: # get the subset topology for the main rep idxs return full_mdj_top.subset(self.main_rep_idxs) elif alt_rep in self.alt_rep_idxs: # get the subset for the alt rep return full_mdj_top.subset(self.alt_rep_idxs[alt_rep]) else: raise ValueError("alt_rep {} not found".format(alt_rep)) def get_topology(self, alt_rep=POSITIONS): """Get a JSON topology for a subset of the atoms in the positions of a particular representation. By default gives the topology for the main 'positions' field (when alt_rep 'positions'). To get the full topology the file was initialized with set `alt_rep` to `None`. Topologies for alternative representations (subfields of 'alt_reps') can be obtained by passing in the key for that alt_rep. For example, 'all_atoms' for the field in alt_reps called 'all_atoms'. 
""" mdj_top = self.get_mdtraj_topology(alt_rep=alt_rep) json_top = mdtraj_to_json_topology(mdj_top) return json_top @property def sparse_fields(self): return self.h5['_settings/sparse_fields'][:] @property def main_rep_idxs(self): if '/_settings/main_rep_idxs' in self.h5: return self.h5['/_settings/main_rep_idxs'][:] else: return None @property def alt_rep_idxs(self): idxs_grp = self.h5['/_settings/alt_reps_idxs'] return {name : ds[:] for name, ds in idxs_grp.items()} @property def field_feature_shapes(self): shapes_grp = self.h5['_settings/field_feature_shapes'] field_paths = iter_field_paths(shapes_grp) shapes = {} for field_path in field_paths: shape = shapes_grp[field_path][()] if np.isnan(shape).all(): shapes[field_path] = None else: shapes[field_path] = shape return shapes @property def field_feature_dtypes(self): dtypes_grp = self.h5['_settings/field_feature_dtypes'] field_paths = iter_field_paths(dtypes_grp) dtypes = {} for field_path in field_paths: dtype_str = dtypes_grp[field_path][()] # if there is 'None' flag for the dtype then return None if dtype_str == 'None': dtypes[field_path] = None else: dtype_obj = json.loads(dtype_str) dtype_obj = [tuple(d) for d in dtype_obj] dtype = np.dtype(dtype_obj) dtypes[field_path] = dtype return dtypes @property def continuations(self): return self.settings_grp['continuations'][:] @property def metadata(self): return dict(self._h5.attrs) def add_metadata(self, key, value): self._h5.attrs[key] = value @property def runs(self): return self.h5['runs'].values() @property def n_runs(self): return len(self._h5['runs']) @property def run_idxs(self): return list(range(len(self._h5['runs']))) def next_run_idx(self): return self.n_runs def run(self, run_idx): return self._h5['runs/{}'.format(int(run_idx))] def traj(self, run_idx, traj_idx): return self._h5['runs/{}/trajectories/{}'.format(run_idx, traj_idx)] def traj_n_frames(self, run_idx, traj_idx): return self.traj(run_idx, traj_idx)[POSITIONS].shape[0] def run_n_frames(self, run_idx): return self.traj_n_frames(run_idx, 0) def run_n_cycles(self, run_idx): return self.run_n_frames(run_idx) def contig_n_cycles(self, run_idxs): # check the contig to make sure it is a valid contig if not self.is_contig(run_idxs): raise ValueError("The run_idxs provided are not a valid contig, {}.".format( run_idxs)) n_cycles = 0 for run_idx in run_idxs: n_cycles += self.run_n_cycles(run_idx) return n_cycles def run_trajs(self, run_idx): return self._h5['runs/{}/trajectories'.format(run_idx)] def n_run_trajs(self, run_idx): return len(self._h5['runs/{}/trajectories'.format(run_idx)]) def next_run_traj_idx(self, run_idx): return self.n_run_trajs(run_idx) def run_traj_idxs(self, run_idx): return range(len(self._h5['runs/{}/trajectories'.format(run_idx)])) def run_traj_idx_tuples(self, runs=None): tups = [] if runs is None: run_idxs = self.run_idxs else: run_idxs = runs for run_idx in run_idxs: for traj_idx in self.run_traj_idxs(run_idx): tups.append((run_idx, traj_idx)) return tups def _init_continuations(self): """This will either create a dataset in the settings for the continuations or if continuations already exist it will reinitialize them and delete the data that exists there. 
""" # if the continuations dset already exists we reinitialize the # data if 'continuations' in self.settings_grp: cont_dset = self.settings_grp['continuations'] cont_dset.resize( (0,2) ) # otherwise we just create the data else: cont_dset = self.settings_grp.create_dataset('continuations', shape=(0,2), dtype=np.int, maxshape=(None, 2)) return cont_dset def init_record_fields(self, run_record_key, record_fields): """Save which records are to be considered from a run record group's datasets to be in the table like representation. This exists to allow there to large and small datasets for records to be stored together but allow for a more compact single table like representation to be produced for serialization. """ record_fields_grp = self.settings_grp['record_fields'] # make a dataset for the sparse fields allowed. this requires # a 'special' datatype for variable length strings. This is # supported by HDF5 but not numpy. vlen_str_dt = h5py.special_dtype(vlen=str) # create the dataset with the strings of the fields which are records record_group_fields_ds = record_fields_grp.create_dataset(run_record_key, (len(record_fields),), dtype=vlen_str_dt, maxshape=(None,)) # set the flags for i, record_field in enumerate(record_fields): record_group_fields_ds[i] = record_field def init_resampling_record_fields(self, resampler): self.init_record_fields(RESAMPLING, resampler.resampling_record_field_names()) def init_resampler_record_fields(self, resampler): self.init_record_fields(RESAMPLER, resampler.resampler_record_field_names()) def init_bc_record_fields(self, bc): self.init_record_fields(BC, bc.bc_record_field_names()) def init_warping_record_fields(self, bc): self.init_record_fields(WARPING, bc.warping_record_field_names()) def init_progress_record_fields(self, bc): self.init_record_fields(PROGRESS, bc.progress_record_field_names()) @property def record_fields(self): record_fields_grp = self.settings_grp['record_fields'] record_fields_dict = {} for group_name, dset in record_fields_grp.items(): record_fields_dict[group_name] = list(dset) return record_fields_dict def _add_run_init(self, run_idx, continue_run=None): """Routines for creating a run includes updating and setting object global variables, increasing the counter for the number of runs.""" # add the run idx as metadata in the run group self._h5['runs/{}'.format(run_idx)].attrs['run_idx'] = run_idx # if this is continuing another run add the tuple (this_run, # continues_run) to the contig settings if continue_run is not None: self.add_continuation(run_idx, continue_run) def add_continuation(self, continuation_run, base_run): """Add a continuation between runs. continuation_run :: the run index of the run that continues base_run base_run :: the run that is being continued """ continuations_dset = self.settings_grp['continuations'] continuations_dset.resize((continuations_dset.shape[0] + 1, continuations_dset.shape[1],)) continuations_dset[continuations_dset.shape[0] - 1] = np.array([continuation_run, base_run]) def link_run(self, filepath, run_idx, continue_run=None, **kwargs): """Add a run from another file to this one as an HDF5 external link. 
Intuitively this is like mounting a drive in a filesystem.""" # link to the external run ext_run_link = h5py.ExternalLink(filepath, 'runs/{}'.format(run_idx)) # the run index in this file, as determined by the counter here_run_idx = self.next_run_idx() # set the local run as the external link to the other run self._h5['runs/{}'.format(here_run_idx)] = ext_run_link # run the initialization routines for adding a run self._add_run_init(here_run_idx, continue_run=continue_run) run_grp = self._h5['runs/{}'.format(here_run_idx)] # add metadata if given for key, val in kwargs.items(): if key != 'run_idx': run_grp.attrs[key] = val else: warn('run_idx metadata is set by wepy and cannot be used', RuntimeWarning) return here_run_idx def link_file_runs(self, wepy_h5_path): """Link all runs from another WepyHDF5 file. This preserves continuations within that file. This will open the file if not already opened. returns the indices of the new runs in this file. """ wepy_h5 = WepyHDF5(wepy_h5_path, mode='r') with wepy_h5: ext_run_idxs = wepy_h5.run_idxs continuations = wepy_h5.continuations # add the runs new_run_idxs = [] for ext_run_idx in ext_run_idxs: # link the next run, and get its new run index new_run_idx = self.link_run(wepy_h5_path, ext_run_idx) # save that run idx new_run_idxs.append(new_run_idx) # copy the continuations over translating the run idxs, # for each continuation in the other files continuations for continuation in continuations: # translate each run index from the external file # continuations to the run idxs they were just assigned in # this file self.add_continuation(new_run_idxs[continuation[0]], new_run_idxs[continuation[1]]) return new_run_idxs def new_run(self, continue_run=None, **kwargs): # check to see if the continue_run is actually in this file if continue_run is not None: if continue_run not in self.run_idxs: raise ValueError("The continue_run idx given, {}, is not present in this file".format( continue_run)) # get the index for this run new_run_idx = self.next_run_idx() # create a new group named the next integer in the counter run_grp = self._h5.create_group('runs/{}'.format(new_run_idx)) # initialize the walkers group traj_grp = run_grp.create_group('trajectories') # run the initialization routines for adding a run self._add_run_init(new_run_idx, continue_run=continue_run) # TODO get rid of this? 
        # add metadata if given
        for key, val in kwargs.items():
            if key != 'run_idx':
                run_grp.attrs[key] = val
            else:
                warn('run_idx metadata is set by wepy and cannot be used',
                     RuntimeWarning)

        return run_grp

    # application level methods for setting the fields for run record
    # groups given the objects themselves

    def init_run_resampling(self, run_idx, resampler):

        # set the enumeration of the decisions
        self.init_run_resampling_decision(run_idx, resampler)

        # set the data fields that can be used for table like records
        resampler.resampler_record_field_names()
        resampler.resampling_record_field_names()

        # then make the records group
        fields = resampler.resampling_fields()
        grp = self.init_run_record_grp(run_idx, RESAMPLING, fields)

        return grp

    def init_run_resampling_decision(self, run_idx, resampler):
        self.init_run_fields_resampling_decision(run_idx,
                                                 resampler.DECISION.enum_dict_by_name())

    def init_run_resampler(self, run_idx, resampler):
        fields = resampler.resampler_fields()
        grp = self.init_run_record_grp(run_idx, RESAMPLER, fields)
        return grp

    def init_run_warping(self, run_idx, bc):
        fields = bc.warping_fields()
        grp = self.init_run_record_grp(run_idx, WARPING, fields)
        return grp

    def init_run_progress(self, run_idx, bc):
        fields = bc.progress_fields()
        grp = self.init_run_record_grp(run_idx, PROGRESS, fields)
        return grp

    def init_run_bc(self, run_idx, bc):
        fields = bc.bc_fields()
        grp = self.init_run_record_grp(run_idx, BC, fields)
        return grp

    # application level methods for initializing the run records
    # groups with just the fields and without the objects

    def init_run_fields_resampling(self, run_idx, fields):
        grp = self.init_run_record_grp(run_idx, RESAMPLING, fields)
        return grp

    def init_run_fields_resampling_decision(self, run_idx, decision_enum_dict):
        decision_grp = self.run(run_idx).create_group('decision')
        for name, value in decision_enum_dict.items():
            decision_grp.create_dataset(name, data=value)

    def init_run_fields_resampler(self, run_idx, fields):
        grp = self.init_run_record_grp(run_idx, RESAMPLER, fields)
        return grp

    def init_run_fields_warping(self, run_idx, fields):
        grp = self.init_run_record_grp(run_idx, WARPING, fields)
        return grp

    def init_run_fields_progress(self, run_idx, fields):
        grp = self.init_run_record_grp(run_idx, PROGRESS, fields)
        return grp

    def init_run_fields_bc(self, run_idx, fields):
        grp = self.init_run_record_grp(run_idx, BC, fields)
        return grp

    def init_run_record_grp(self, run_idx, run_record_key, fields):

        # initialize the record group based on whether it is sporadic
        # or continual
        if self._is_sporadic_records(run_record_key):
            grp = self._init_run_sporadic_record_grp(run_idx, run_record_key,
                                                     fields)
        else:
            grp = self._init_run_continual_record_grp(run_idx, run_record_key,
                                                      fields)

        return grp

    def _init_run_sporadic_record_grp(self, run_idx, run_record_key, fields):

        # create the group
        run_grp = self.run(run_idx)
        record_grp = run_grp.create_group(run_record_key)

        # initialize the cycles dataset that maps when the records
        # were recorded
        record_grp.create_dataset(CYCLE_IDXS, (0,), dtype=np.int,
                                  maxshape=(None,))

        # for each field simply create the dataset
        for field_name, field_shape, field_dtype in fields:

            # initialize this field
            self._init_run_records_field(run_idx, run_record_key,
                                         field_name, field_shape, field_dtype)

        return record_grp

    def _init_run_continual_record_grp(self, run_idx, run_record_key, fields):

        # create the group
        run_grp = self.run(run_idx)
        record_grp = run_grp.create_group(run_record_key)

        # for each field simply create the dataset
        for field_name, field_shape, field_dtype in fields:
self._init_run_records_field(run_idx, run_record_key, field_name, field_shape, field_dtype) return record_grp def _init_run_records_field(self, run_idx, run_record_key, field_name, field_shape, field_dtype): record_grp = self.run(run_idx)[run_record_key] # check if it is variable length if field_shape is Ellipsis: # make a special dtype that allows it to be # variable length vlen_dt = h5py.special_dtype(vlen=field_dtype) # this is only allowed to be a single dimension # since no real shape was given dset = record_grp.create_dataset(field_name, (0,), dtype=vlen_dt, maxshape=(None,)) # its not just make it normally else: # create the group dset = record_grp.create_dataset(field_name, (0, *field_shape), dtype=field_dtype, maxshape=(None, *field_shape)) return dset def _is_sporadic_records(self, run_record_key): # assume it is continual and check if it is in the sporadic groups if run_record_key in SPORADIC_RECORDS: return True else: return False def _init_traj_field(self, run_idx, traj_idx, field_path, feature_shape, dtype): """Initialize a data field in the trajectory to be empty but resizeable.""" # check whether this is a sparse field and create it # appropriately if field_path in self.sparse_fields: # it is a sparse field self._init_sparse_traj_field(run_idx, traj_idx, field_path, feature_shape, dtype) else: # it is not a sparse field (AKA simple) self._init_contiguous_traj_field(run_idx, traj_idx, field_path, feature_shape, dtype) def _init_contiguous_traj_field(self, run_idx, traj_idx, field_path, shape, dtype): traj_grp = self._h5['runs/{}/trajectories/{}'.format(run_idx, traj_idx)] # create the empty dataset in the correct group, setting # maxshape so it can be resized for new feature vectors to be added traj_grp.create_dataset(field_path, (0, *[0 for i in shape]), dtype=dtype, maxshape=(None, *shape)) def _init_sparse_traj_field(self, run_idx, traj_idx, field_path, shape, dtype): traj_grp = self._h5['runs/{}/trajectories/{}'.format(run_idx, traj_idx)] # check to see that neither the shape and dtype are # None which indicates it is a runtime defined value and # should be ignored here if (shape is None) or (dtype is None): # do nothing pass else: # only create the group if you are going to add the # datasets so the extend function can know if it has been # properly initialized easier sparse_grp = traj_grp.create_group(field_path) # create the dataset for the feature data sparse_grp.create_dataset('data', (0, *[0 for i in shape]), dtype=dtype, maxshape=(None, *shape)) # create the dataset for the sparse indices sparse_grp.create_dataset('_sparse_idxs', (0,), dtype=np.int, maxshape=(None,)) def _init_traj_fields(self, run_idx, traj_idx, field_paths, field_feature_shapes, field_feature_dtypes): for i, field_path in enumerate(field_paths): self._init_traj_field(run_idx, traj_idx, field_path, field_feature_shapes[i], field_feature_dtypes[i]) def traj_n_frames(self, run_idx, traj_idx): return self.traj(run_idx, traj_idx)[POSITIONS].shape[0] def add_traj(self, run_idx, data, weights=None, sparse_idxs=None, metadata=None): # convenient alias traj_data = data # initialize None kwargs if sparse_idxs is None: sparse_idxs = {} if metadata is None: metadata = {} # positions are mandatory assert POSITIONS in traj_data, "positions must be given to create a trajectory" assert isinstance(traj_data[POSITIONS], np.ndarray) n_frames = traj_data[POSITIONS].shape[0] # if weights are None then we assume they are 1.0 if weights is None: weights = np.ones((n_frames, 1), dtype=float) else: assert 
isinstance(weights, np.ndarray), "weights must be a numpy.ndarray" assert weights.shape[0] == n_frames,\ "weights and the number of frames must be the same length" # current traj_idx traj_idx = self.next_run_traj_idx(run_idx) # make a group for this trajectory, with the current traj_idx # for this run traj_grp = self._h5.create_group( 'runs/{}/trajectories/{}'.format(run_idx, traj_idx)) # add the run_idx as metadata traj_grp.attrs['run_idx'] = run_idx # add the traj_idx as metadata traj_grp.attrs['traj_idx'] = traj_idx # add the rest of the metadata if given for key, val in metadata.items(): if not key in ['run_idx', 'traj_idx']: traj_grp.attrs[key] = val else: warn("run_idx and traj_idx are used by wepy and cannot be set", RuntimeWarning) # check to make sure the positions are the right shape assert traj_data[POSITIONS].shape[1] == self.n_atoms, \ "positions given have different number of atoms: {}, should be {}".format( traj_data[POSITIONS].shape[1], self.n_atoms) assert traj_data[POSITIONS].shape[2] == self.n_dims, \ "positions given have different number of dims: {}, should be {}".format( traj_data[POSITIONS].shape[2], self.n_dims) # add datasets to the traj group # weights traj_grp.create_dataset(WEIGHTS, data=weights, maxshape=(None, *WEIGHT_SHAPE)) # positions positions_shape = traj_data[POSITIONS].shape # add the rest of the traj_data for field_path, field_data in traj_data.items(): # if there were sparse idxs for this field pass them in if field_path in sparse_idxs: field_sparse_idxs = sparse_idxs[field_path] # if this is a sparse field and no sparse_idxs were given # we still need to initialize it as a sparse field so it # can be extended properly so we make sparse_idxs to match # the full length of this initial trajectory data elif field_path in self.sparse_fields: field_sparse_idxs = np.arange(positions_shape[0]) # otherwise it is not a sparse field so we just pass in None else: field_sparse_idxs = None self._add_traj_field_data(run_idx, traj_idx, field_path, field_data, sparse_idxs=field_sparse_idxs) ## initialize empty sparse fields # get the sparse field datasets that haven't been initialized traj_init_fields = list(sparse_idxs.keys()) + list(traj_data.keys()) uninit_sparse_fields = set(self.sparse_fields).difference(traj_init_fields) # the shapes uninit_sparse_shapes = [self.field_feature_shapes[field] for field in uninit_sparse_fields] # the dtypes uninit_sparse_dtypes = [self.field_feature_dtypes[field] for field in uninit_sparse_fields] # initialize the sparse fields in the hdf5 self._init_traj_fields(run_idx, traj_idx, uninit_sparse_fields, uninit_sparse_shapes, uninit_sparse_dtypes) return traj_grp def _add_traj_field_data(self, run_idx, traj_idx, field_path, field_data, sparse_idxs=None): # get the traj group traj_grp = self._h5['runs/{}/trajectories/{}'.format(run_idx, traj_idx)] # if it is a sparse dataset we need to add the data and add # the idxs in a group if sparse_idxs is None: traj_grp.create_dataset(field_path, data=field_data, maxshape=(None, *field_data.shape[1:])) else: sparse_grp = traj_grp.create_group(field_path) # add the data to this group sparse_grp.create_dataset('data', data=field_data, maxshape=(None, *field_data.shape[1:])) # add the sparse idxs sparse_grp.create_dataset('_sparse_idxs', data=sparse_idxs, maxshape=(None,)) def _extend_dataset(self, dset_path, new_data): dset = self.h5[dset_path] extend_dataset(dset, new_data) def _extend_contiguous_traj_field(self, run_idx, traj_idx, field_path, field_data): traj_grp = 
self.h5['/runs/{}/trajectories/{}'.format(run_idx, traj_idx)] field = traj_grp[field_path] # make sure this is a feature vector assert len(field_data.shape) > 1, \ "field_data must be a feature vector with the same number of dimensions as the number" # of datase new frames n_new_frames = field_data.shape[0] # check the field to make sure it is not empty if all([i == 0 for i in field.shape]): # check the feature shape against the maxshape which gives # the feature dimensions for an empty dataset assert field_data.shape[1:] == field.maxshape[1:], \ "field feature dimensions must be the same, i.e. all but the first dimension" # if it is empty resize it to make an array the size of # the new field_data with the maxshape for the feature # dimensions feature_dims = field.maxshape[1:] field.resize( (n_new_frames, *feature_dims) ) # set the new data to this field[0:, ...] = field_data else: # make sure the new data has the right dimensions against # the shape it already has assert field_data.shape[1:] == field.shape[1:], \ "field feature dimensions must be the same, i.e. all but the first dimension" # append to the dataset on the first dimension, keeping the # others the same, these must be feature vectors and therefore # must exist field.resize( (field.shape[0] + n_new_frames, *field.shape[1:]) ) # add the new data field[-n_new_frames:, ...] = field_data def _extend_sparse_traj_field(self, run_idx, traj_idx, field_path, values, sparse_idxs): field = self.h5['/runs/{}/trajectories/{}/{}'.format(run_idx, traj_idx, field_path)] field_data = field['data'] field_sparse_idxs = field['_sparse_idxs'] # number of new frames n_new_frames = values.shape[0] # if this sparse_field has been initialized empty we need to resize if all([i == 0 for i in field_data.shape]): # check the feature shape against the maxshape which gives # the feature dimensions for an empty dataset assert values.shape[1:] == field_data.maxshape[1:], \ "input value features have shape {}, expected {}".format( values.shape[1:], field_data.maxshape[1:]) # if it is empty resize it to make an array the size of # the new values with the maxshape for the feature # dimensions feature_dims = field_data.maxshape[1:] field_data.resize( (n_new_frames, *feature_dims) ) # set the new data to this field_data[0:, ...] = values else: # make sure the new data has the right dimensions assert values.shape[1:] == field_data.shape[1:], \ "field feature dimensions must be the same, i.e. all but the first dimension" # append to the dataset on the first dimension, keeping the # others the same, these must be feature vectors and therefore # must exist field_data.resize( (field_data.shape[0] + n_new_frames, *field_data.shape[1:]) ) # add the new data field_data[-n_new_frames:, ...] = values # add the sparse idxs in the same way field_sparse_idxs.resize( (field_sparse_idxs.shape[0] + n_new_frames, *field_sparse_idxs.shape[1:]) ) # add the new data field_sparse_idxs[-n_new_frames:, ...] 
= sparse_idxs def extend_traj(self, run_idx, traj_idx, data, weights=None): if self._wepy_mode == 'c-': assert self._append_flags[dataset_key], "dataset is not available for appending to" # convenient alias traj_data = data # number of frames to add n_new_frames = traj_data[POSITIONS].shape[0] n_frames = self.traj_n_frames(run_idx, traj_idx) # calculate the new sparse idxs for sparse fields that may be # being added sparse_idxs = np.array(range(n_frames, n_frames + n_new_frames)) # get the trajectory group traj_grp = self._h5['runs/{}/trajectories/{}'.format(run_idx, traj_idx)] ## weights # if weights are None then we assume they are 1.0 if weights is None: weights = np.ones((n_new_frames, 1), dtype=float) else: assert isinstance(weights, np.ndarray), "weights must be a numpy.ndarray" assert weights.shape[0] == n_new_frames,\ "weights and the number of frames must be the same length" # add the weights weights_ds = traj_grp[WEIGHTS] # append to the dataset on the first dimension, keeping the # others the same, if they exist if len(weights_ds.shape) > 1: weights_ds.resize( (weights_ds.shape[0] + n_new_frames, *weights_ds.shape[1:]) ) else: weights_ds.resize( (weights_ds.shape[0] + n_new_frames, ) ) # add the new data weights_ds[-n_new_frames:, ...] = weights # add the other fields for field_path, field_data in traj_data.items(): # if the field hasn't been initialized yet initialize it if not field_path in traj_grp: feature_shape = field_data.shape[1:] feature_dtype = field_data.dtype # not specified as sparse_field, no settings if (not field_path in self.field_feature_shapes) and \ (not field_path in self.field_feature_dtypes) and \ not field_path in self.sparse_fields: # only save if it is an observable is_observable = False if '/' in field_path: group_name = field_path.split('/')[0] if group_name == OBSERVABLES: is_observable = True if is_observable: warn("the field '{}' was received but not previously specified" " but is being added because it is in observables.".format(field_path)) # save sparse_field flag, shape, and dtype self._add_sparse_field_flag(field_path) self._set_field_feature_shape(field_path, feature_shape) self._set_field_feature_dtype(field_path, feature_dtype) else: raise ValueError("the field '{}' was received but not previously specified" "it is being ignored because it is not an observable.".format(field_path)) # specified as sparse_field but no settings given elif (self.field_feature_shapes[field_path] is None and self.field_feature_dtypes[field_path] is None) and \ field_path in self.sparse_fields: # set the feature shape and dtype since these # should be 0 in the settings self._set_field_feature_shape(field_path, feature_shape) self._set_field_feature_dtype(field_path, feature_dtype) # initialize self._init_traj_field(run_idx, traj_idx, field_path, feature_shape, feature_dtype) # extend it either as a sparse field or a contiguous field if field_path in self.sparse_fields: self._extend_sparse_traj_field(run_idx, traj_idx, field_path, field_data, sparse_idxs) else: self._extend_contiguous_traj_field(run_idx, traj_idx, field_path, field_data) def _add_sparse_field_flag(self, field_path): sparse_fields_ds = self._h5['_settings/sparse_fields'] # make sure it isn't already in the sparse_fields if field_path in sparse_fields_ds[:]: warn("sparse field {} already a sparse field, ignoring".format(field_path)) sparse_fields_ds.resize( (sparse_fields_ds.shape[0] + 1,) ) sparse_fields_ds[sparse_fields_ds.shape[0] - 1] = field_path def _add_field_feature_shape(self, field_path, 
field_feature_shape): shapes_grp = self._h5['_settings/field_feature_shapes'] shapes_grp.create_dataset(field_path, data=np.array(field_feature_shape)) def _add_field_feature_dtype(self, field_path, field_feature_dtype): feature_dtype_str = json.dumps(field_feature_dtype.descr) dtypes_grp = self._h5['_settings/field_feature_dtypes'] dtypes_grp.create_dataset(field_path, data=feature_dtype_str) def _set_field_feature_shape(self, field_path, field_feature_shape): # check if the field_feature_shape is already set if field_path in self.field_feature_shapes: # check that the shape was previously saved as "None" as we # won't overwrite anything else if self.field_feature_shapes[field_path] is None: full_path = '_settings/field_feature_shapes/{}'.format(field_path) # we have to delete the old data and set new data del self.h5[full_path] self.h5.create_dataset(full_path, data=field_feature_shape) else: raise AttributeError( "Cannot overwrite feature shape for {} with {} because it is {} not 'None'".format( field_path, field_feature_shape, self.field_feature_shapes[field_path])) # it was not previously set so we must create then save it else: self._add_field_feature_shape(field_path, field_feature_shape) def _set_field_feature_dtype(self, field_path, field_feature_dtype): feature_dtype_str = json.dumps(field_feature_dtype.descr) # check if the field_feature_dtype is already set if field_path in self.field_feature_dtypes: # check that the dtype was previously saved as "None" as we # won't overwrite anything else if self.field_feature_dtypes[field_path] is None: full_path = '_settings/field_feature_dtypes/{}'.format(field_path) # we have to delete the old data and set new data del self.h5[full_path] self.h5.create_dataset(full_path, data=feature_dtype_str) else: raise AttributeError( "Cannot overwrite feature dtype for {} with {} because it is {} not 'None'".format( field_path, field_feature_dtype, self.field_feature_dtypes[field_path])) # it was not previously set so we must create then save it else: self._add_field_feature_dtype(field_path, field_feature_dtype) def decision_grp(self, run_idx): return self.run(run_idx)['decision'] def decision_enum(self, run_idx): enum_grp = self.decision_grp(run_idx) enum = {} for decision_name, dset in enum_grp.items(): enum[decision_name] = dset[()] return enum def decision_value_names(self, run_idx): enum_grp = self.decision_grp(run_idx) rev_enum = {} for decision_name, dset in enum_grp.items(): value = dset[()] rev_enum[value] = decision_name return rev_enum ## application level append methods for run records groups def extend_cycle_warping_records(self, run_idx, cycle_idx, warping_data): self.extend_cycle_run_group_records(run_idx, WARPING, cycle_idx, warping_data) def extend_cycle_bc_records(self, run_idx, cycle_idx, bc_data): self.extend_cycle_run_group_records(run_idx, BC, cycle_idx, bc_data) def extend_cycle_progress_records(self, run_idx, cycle_idx, progress_data): self.extend_cycle_run_group_records(run_idx, PROGRESS, cycle_idx, progress_data) def extend_cycle_resampling_records(self, run_idx, cycle_idx, resampling_data): self.extend_cycle_run_group_records(run_idx, RESAMPLING, cycle_idx, resampling_data) def extend_cycle_resampler_records(self, run_idx, cycle_idx, resampler_data): self.extend_cycle_run_group_records(run_idx, RESAMPLER, cycle_idx, resampler_data) def extend_cycle_run_group_records(self, run_idx, run_record_key, cycle_idx, fields_data): """Append data for a whole records group, that is every field dataset. 
This must have the cycle index for the data it is appending as this is done for sporadic and continual datasets. """ record_grp = self.records_grp(run_idx, run_record_key) # if it is sporadic add the cycle idx if self._is_sporadic_records(run_record_key): # get the cycle idxs dataset record_cycle_idxs_ds = record_grp[CYCLE_IDXS] # number of old and new records n_new_records = len(fields_data) n_existing_records = record_cycle_idxs_ds.shape[0] # make a new chunk for the new records record_cycle_idxs_ds.resize( (n_existing_records + n_new_records,) ) # add an array of the cycle idx for each record record_cycle_idxs_ds[n_existing_records:] = np.full((n_new_records,), cycle_idx) # then add all the data for the field for record_dict in fields_data: for field_name, field_data in record_dict.items(): self._extend_run_record_data_field(run_idx, run_record_key, field_name, np.array([field_data])) def _extend_run_record_data_field(self, run_idx, run_record_key, field_name, field_data): """Adds data for a single field dataset in a run records group. This is done without paying attention to whether it is sporadic or continual and is supposed to be only the data write method. """ records_grp = self.h5['runs/{}/{}'.format(run_idx, run_record_key)] field = records_grp[field_name] # make sure this is a feature vector assert len(field_data.shape) > 1, \ "field_data must be a feature vector with the same number of dimensions as the number" # of datase new frames n_new_frames = field_data.shape[0] # check whether it is a variable length record, by getting the # record dataset dtype and using the checker to see if it is # the vlen special type in h5py if h5py.check_dtype(vlen=field.dtype) is not None: # if it is we have to treat it differently, since it # cannot be multidimensional # if the dataset has no data in it we need to reshape it if all([i == 0 for i in field.shape]): # initialize this array # if it is empty resize it to make an array the size of # the new field_data with the maxshape for the feature # dimensions field.resize( (n_new_frames,) ) # set the new data to this for i, row in enumerate(field_data): field[i] = row # otherwise just add the data else: # resize the array but it is only of rank because # of variable length data field.resize( (field.shape[0] + n_new_frames, ) ) # add each row to the newly made space for i, row in enumerate(field_data): field[(field.shape[0] - 1) + i] = row # if it is not variable length we don't have to treat it # differently else: # if this is empty we need to reshape the dataset to accomodate data if all([i == 0 for i in field.shape]): # check the feature shape against the maxshape which gives # the feature dimensions for an empty dataset assert field_data.shape[1:] == field.maxshape[1:], \ "field feature dimensions must be the same, i.e. all but the first dimension" # if it is empty resize it to make an array the size of # the new field_data with the maxshape for the feature # dimensions feature_dims = field.maxshape[1:] field.resize( (n_new_frames, *feature_dims) ) # set the new data to this field[0:, ...] = field_data # otherwise just add the data else: # append to the dataset on the first dimension, keeping the # others the same, these must be feature vectors and therefore # must exist field.resize( (field.shape[0] + n_new_frames, *field.shape[1:]) ) # add the new data field[-n_new_frames:, ...] 
= field_data def get_traj_field_cycle_idxs(self, run_idx, traj_idx, field_path): """ Returns the sparse indices for a field""" traj_path = "/runs/{}/trajectories/{}".format(run_idx, traj_idx) # if the field doesn't exist return None if not field_path in self._h5[traj_path]: raise KeyError("key for field {} not found".format(field_path)) # return None # if the field is not sparse just return the cycle indices for # that run if field_path not in self.sparse_fields: cycle_idxs = np.array(range(self.run_n_cycles(run_idx))) else: cycle_idxs = self._h5[traj_path][field_path]['_sparse_idxs'][:] return cycle_idxs def get_traj_field(self, run_idx, traj_idx, field_path, frames=None, masked=True): """Returns a numpy array for the given field. You can control how sparse fields are returned using the `masked` option. When True (default) a masked numpy array will be returned such that you can get which cycles it is from, when False an unmasked array of the data will be returned which has no cycle information. """ traj_path = "/runs/{}/trajectories/{}".format(run_idx, traj_idx) # if the field doesn't exist return None if not field_path in self._h5[traj_path]: raise KeyError("key for field {} not found".format(field_path)) # return None # get the field depending on whether it is sparse or not if field_path in self.sparse_fields: return self._get_sparse_traj_field(run_idx, traj_idx, field_path, frames=frames, masked=masked) else: return self._get_contiguous_traj_field(run_idx, traj_idx, field_path, frames=frames) def _get_contiguous_traj_field(self, run_idx, traj_idx, field_path, frames=None): full_path = "/runs/{}/trajectories/{}/{}".format(run_idx, traj_idx, field_path) if frames is None: field = self._h5[full_path][:] else: field = self._h5[full_path][list(frames)] return field def _get_sparse_traj_field(self, run_idx, traj_idx, field_path, frames=None, masked=True): traj_path = "/runs/{}/trajectories/{}".format(run_idx, traj_idx) traj_grp = self.h5[traj_path] field = traj_grp[field_path] n_frames = traj_grp[POSITIONS].shape[0] if frames is None: data = field['data'][:] # if it is to be masked make the masked array if masked: sparse_idxs = field['_sparse_idxs'][:] filled_data = np.full( (n_frames, *data.shape[1:]), np.nan) filled_data[sparse_idxs] = data mask = np.full( (n_frames, *data.shape[1:]), True) mask[sparse_idxs] = False data = np.ma.masked_array(filled_data, mask=mask) else: # get the sparse idxs and the frames to slice from the # data sparse_idxs = field['_sparse_idxs'][:] # we get a boolean array of the rows of the data table # that we are to slice from sparse_frame_idxs = np.argwhere(np.isin(sparse_idxs, frames)) data = field['data'][list(sparse_frame_idxs)] # if it is to be masked make the masked array if masked: # the empty arrays the size of the number of requested frames filled_data = np.full( (len(frames), *field['data'].shape[1:]), np.nan) mask = np.full( (len(frames), *field['data'].shape[1:]), True ) # take the data which exists and is part of the frames # selection, and put it into the filled data where it is # supposed to be filled_data[np.isin(frames, sparse_idxs)] = data # unmask the present values mask[np.isin(frames, sparse_idxs)] = False data = np.ma.masked_array(filled_data, mask=mask) return data def get_trace_fields(self, frame_tups, fields): frame_fields = {field : [] for field in fields} for run_idx, traj_idx, cycle_idx in frame_tups: for field in fields: frame_field = self.get_traj_field(run_idx, traj_idx, field, frames=[cycle_idx]) # the first dimension doesn't matter 
here since we # only get one frame at a time. frame_fields[field].append(frame_field[0]) # combine all the parts of each field into single arrays for field in fields: frame_fields[field] = np.array(frame_fields[field]) return frame_fields def get_run_trace_fields(self, run_idx, frame_tups, fields): frame_fields = {field : [] for field in fields} for traj_idx, cycle_idx in frame_tups: for field in fields: frame_field = self.get_traj_field(run_idx, traj_idx, field, frames=[cycle_idx]) # the first dimension doesn't matter here since we # only get one frame at a time. frame_fields[field].append(frame_field[0]) # combine all the parts of each field into single arrays for field in fields: frame_fields[field] = np.array(frame_fields[field]) return frame_fields def _add_run_field(self, run_idx, field_path, data, sparse_idxs=None): """ Add a field to your trajectories runs""" # check that the data has the correct number of trajectories assert len(data) == self.n_run_trajs(run_idx),\ "The number of trajectories in data, {}, is different than the number"\ "of trajectories in the run, {}.".format(len(data), self.n_run_trajs(run_idx)) # for each trajectory check that the data is compliant for traj_idx, traj_data in enumerate(data): # check that the number of frames is not larger than that for the run if traj_data.shape[0] > self.run_n_frames(run_idx): raise ValueError("The number of frames in data for traj {} , {}," "is larger than the number of frames" "for this run, {}.".format( traj_idx, data.shape[1], self.run_n_frames(run_idx))) # if the number of frames given is the same or less than # the number of frames in the run elif (traj_data.shape[0] <= self.run_n_frames(run_idx)): # if sparse idxs were given we check to see there is # the right number of them if sparse_idxs is not None: # and that they match the number of frames given if data.shape[0] != len(sparse_idxs[traj_idx]): raise ValueError("The number of frames provided for traj {}, {}," "was less than the total number of frames, {}," "but an incorrect number of sparse idxs were supplied, {}."\ .format(traj_idx, traj_data.shape[0], self.run_n_frames(run_idx), len(sparse_idxs[traj_idx]))) # if there were strictly fewer frames given and the # sparse idxs were not given we need to raise an error elif (traj_data.shape[0] < self.run_n_frames(run_idx)): raise ValueError("The number of frames provided for traj {}, {}," "was less than the total number of frames, {}," "but sparse_idxs were not supplied.".format( traj_idx, traj_data.shape[0], self.run_n_frames(run_idx))) # add it to each traj for i, idx_tup in enumerate(self.run_traj_idx_tuples([run_idx])): if sparse_idxs is None: self._add_traj_field_data(*idx_tup, field_path, data[i]) else: self._add_traj_field_data(*idx_tup, field_path, data[i], sparse_idxs=sparse_idxs[i]) def _add_field(self, field_path, data, sparse_idxs=None): for i, run_idx in enumerate(self.run_idxs): if sparse_idxs is not None: self._add_run_field(run_idx, field_path, data[i], sparse_idxs=sparse_idxs[i]) else: self._add_run_field(run_idx, field_path, data[i]) def iter_runs(self, idxs=False, run_sel=None): """Iterate through runs. idxs : if True returns `(run_idx, run_group)`, False just `run_group` run_sel : if True will iterate over a subset of runs. Possible values are an iterable of indices of runs to iterate over. 
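
    Example (a minimal sketch; assumes the file is open and contains the
    requested runs):

        # iterate over a subset of runs, keeping the run indices
        for run_idx, run_grp in wepy_h5.iter_runs(idxs=True, run_sel=[0, 2]):
            print(run_idx, len(run_grp['trajectories']))
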
""" if run_sel is None: run_sel = self.run_idxs for run_idx in self.run_idxs: if run_idx in run_sel: run = self.run(run_idx) if idxs: yield run_idx, run else: yield run def iter_trajs(self, idxs=False, traj_sel=None): """Generator for all of the trajectories in the dataset across all runs. If idxs=True will return a tuple of (run_idx, traj_idx). run_sel : if True will iterate over a subset of trajectories. Possible values are an iterable of `(run_idx, traj_idx)` tuples. """ # set the selection of trajectories to iterate over if traj_sel is None: idx_tups = self.run_traj_idx_tuples() else: idx_tups = traj_sel # get each traj for each idx_tup and yield them for the generator for run_idx, traj_idx in idx_tups: traj = self.traj(run_idx, traj_idx) if idxs: yield (run_idx, traj_idx), traj else: yield traj def iter_run_trajs(self, run_idx, idxs=False): run_sel = self.run_traj_idx_tuples([run_idx]) return self.iter_trajs(idxs=idxs, traj_sel=run_sel) def iter_trajs_fields(self, fields, idxs=False, traj_sel=None): """Generator for all of the specified non-compound fields h5py.Datasets for all trajectories in the dataset across all runs. Fields is a list of valid relative paths to datasets in the trajectory groups. """ for idx_tup, traj in self.iter_trajs(idxs=True, traj_sel=traj_sel): run_idx, traj_idx = idx_tup dsets = {} # DEBUG if we ask for debug prints send in the run and # traj index so the function can print this out dsets['run_idx'] = run_idx dsets['traj_idx'] = traj_idx for field in fields: try: dset = traj[field][:] except KeyError: warn("field \"{}\" not found in \"{}\"".format(field, traj.name), RuntimeWarning) dset = None dsets[field] = dset if idxs: yield (run_idx, traj_idx), dsets else: yield dsets def run_map(self, func, *args, map_func=map, idxs=False, run_sel=None): """Function for mapping work onto trajectories in the WepyHDF5 file object. The call to iter_runs is run with `idxs=False`. func : the function that will be mapped to trajectory groups map_func : the function that maps the function. This is where parallelization occurs if desired. Defaults to the serial python map function. traj_sel : a trajectory selection. This is a valid `traj_sel` argument for the `iter_trajs` function. idxs : if True results contain [(run_idx, result),...], if False returns [result,...] *args : additional arguments to the function. If this is an iterable it will be assumed that it is the appropriate length for the number of trajectories, WARNING: this will not be checked and could result in a run time error. Otherwise single values will be automatically mapped to all trajectories. **kwargs : same as *args, but will pass all kwargs to the func. 
""" # check the args and kwargs to see if they need expanded for # mapping inputs mapped_args = [] for arg in args: # if it is a sequence or generator we keep just pass it to the mapper if isinstance(arg, Sequence) and not isinstance(arg, str): assert len(arg) == self.n_runs, \ "argument Sequence has fewer number of args then trajectories" mapped_args.append(arg) # if it is not a sequence or generator we make a generator out # of it to map as inputs else: mapped_arg = (arg for i in range(self.n_runs)) mapped_args.append(mapped_arg) results = map_func(func, self.iter_runs(idxs=False, run_sel=run_sel), *mapped_args) if idxs: if run_sel is None: run_sel = self.run_idxs return zip(run_sel, results) else: return results def traj_map(self, func, *args, map_func=map, idxs=False, traj_sel=None): """Function for mapping work onto trajectories in the WepyHDF5 file object. func : the function that will be mapped to trajectory groups map_func : the function that maps the function. This is where parallelization occurs if desired. Defaults to the serial python map function. traj_sel : a trajectory selection. This is a valid `traj_sel` argument for the `iter_trajs` function. *args : additional arguments to the function. If this is an iterable it will be assumed that it is the appropriate length for the number of trajectories, WARNING: this will not be checked and could result in a run time error. Otherwise single values will be automatically mapped to all trajectories. """ # check the args and kwargs to see if they need expanded for # mapping inputs mapped_args = [] for arg in args: # if it is a sequence or generator we keep just pass it to the mapper if isinstance(arg, Sequence) and not isinstance(arg, str): assert len(arg) == self.n_trajs, "Sequence has fewer" mapped_args.append(arg) # if it is not a sequence or generator we make a generator out # of it to map as inputs else: mapped_arg = (arg for i in range(self.n_trajs)) mapped_args.append(mapped_arg) results = map_func(func, self.iter_trajs(traj_sel=traj_sel), *mapped_args) if idxs: if traj_sel is None: traj_sel = self.run_traj_idx_tuples() return zip(traj_sel, results) else: return results def traj_fields_map(self, func, fields, *args, map_func=map, idxs=False, traj_sel=None): """Function for mapping work onto field of trajectories in the WepyHDF5 file object. Similar to traj_map, except `h5py.Group` objects cannot be pickled for message passing. So we select the fields to serialize instead and pass the `numpy.ndarray`s to have the work mapped to them. func : the function that will be mapped to trajectory groups fields : list of fields that will be serialized into a dictionary and passed to the map function. These must be valid `h5py` path strings relative to the trajectory group. These include the standard fields like 'positions' and 'weights', as well as compound paths e.g. 'observables/sasa'. map_func : the function that maps the function. This is where parallelization occurs if desired. Defaults to the serial python map function. traj_sel : a trajectory selection. This is a valid `traj_sel` argument for the `iter_trajs` function. *args : additional arguments to the function. If this is an iterable it will be assumed that it is the appropriate length for the number of trajectories, WARNING: this will not be checked and could result in a run time error. Otherwise single values will be automatically mapped to all trajectories. 
""" # check the args and kwargs to see if they need expanded for # mapping inputs #first go through each run and get the number of cycles n_cycles = 0 for run_idx in self.run_idxs: n_cycles += self.run_n_cycles(run_idx) mapped_args = [] for arg in args: # if it is a sequence or generator we keep just pass it to the mapper if isinstance(arg, list) and not isinstance(arg, str): assert len(arg) == len(n_cycles), "Sequence has fewer" mapped_args.append(arg) # if it is not a sequence or generator we make a generator out # of it to map as inputs else: mapped_arg = (arg for i in range(n_cycles)) mapped_args.append(mapped_arg) results = map_func(func, self.iter_trajs_fields(fields, traj_sel=traj_sel, idxs=False), *mapped_args) if idxs: if traj_sel is None: traj_sel = self.run_traj_idx_tuples() return zip(traj_sel, results) else: return results def add_run_observable(self, run_idx, observable_name, data, sparse_idxs=None): obs_path = "{}/{}".format(OBSERVABLES, observable_name) self._add_run_field(run_idx, obs_path, data, sparse_idxs=sparse_idxs) def add_observable(self, observable_name, data, sparse_idxs=None): obs_path = "{}/{}".format(OBSERVABLES, observable_name) self._add_field(obs_path, data, sparse_idxs=sparse_idxs) def compute_observable(self, func, fields, *args, map_func=map, traj_sel=None, save_to_hdf5=None, idxs=False, return_results=True): """Compute an observable on the trajectory data according to a function. Optionally save that data in the observables data group for the trajectory. """ if save_to_hdf5 is not None: assert self.mode in ['w', 'w-', 'x', 'r+', 'c', 'c-'],\ "File must be in a write mode" assert isinstance(save_to_hdf5, str),\ "`save_to_hdf5` should be the field name to save the data in the `observables`"\ " group in each trajectory" field_name=save_to_hdf5 # DEBUG enforce this until sparse trajectories are implemented # assert traj_sel is None, "no selections until sparse trajectory data is implemented" if return_results: results = [] for result in self.traj_fields_map(func, fields, *args, map_func=map_func, traj_sel=traj_sel, idxs=True): idx_tup, obs_features = result run_idx, traj_idx = idx_tup # if we are saving this to the trajectories observables add it as a dataset if save_to_hdf5: logging.info("Saving run {} traj {} observables/{}".format( run_idx, traj_idx, field_name)) # try to get the observables group or make it if it doesn't exist try: obs_grp = self.traj(run_idx, traj_idx)[OBSERVABLES] except KeyError: logging.info("Group uninitialized. Initializing.") obs_grp = self.traj(run_idx, traj_idx).create_group(OBSERVABLES) # try to create the dataset try: obs_grp.create_dataset(field_name, data=obs_features) # if it fails we either overwrite or raise an error except RuntimeError: # if we are in a permissive write mode we delete the # old dataset and add the new one, overwriting old data if self.mode in ['w', 'w-', 'x', 'r+']: logging.info("Dataset already present. 
Overwriting.") del obs_grp[field_name] obs_grp.create_dataset(field_name, data=obs_features) # this will happen in 'c' and 'c-' modes else: raise RuntimeError( "Dataset already exists and file is in concatenate mode ('c' or 'c-')") # also return it if requested if return_results: if idxs: results.append(( idx_tup, obs_features)) else: results.append(obs_features) if return_results: return results @classmethod def _spanning_paths(cls, edges, root): # nodes targetting this root root_sources = [] # go through all the edges and find those with this # node as their target for edge_source, edge_target in edges: # check if the target_node we are looking for matches # the edge target node if root == edge_target: # if this root is a target of the source add it to the # list of edges targetting this root root_sources.append(edge_source) # from the list of source nodes targetting this root we choose # the lowest index one, so we sort them and iterate through # finding the paths starting from it recursively root_paths = [] root_sources.sort() for new_root in root_sources: # add these paths for this new root to the paths for the # current root root_paths.extend(cls._spanning_paths(edges, new_root)) # if there are no more sources to this root it is a leaf node and # we terminate recursion, by not entering the loop above, however # we manually generate an empty list for a path so that we return # this "root" node as a leaf, for default. if len(root_paths) < 1: root_paths = [[]] final_root_paths = [] for root_path in root_paths: final_root_paths.append([root] + root_path) return final_root_paths def spanning_contigs(self): """Returns a list of all possible spanning contigs given the continuations present in this file. Contigs are a list of runs in the order that makes a continuous set of data. Spanning contigs are always as long as possible, thus all must start from a root and end at a leaf node. This algorithm always returns them in a canonical order (as long as the runs are not rearranged after being added). This means that the indices here are the indices of the contigs. Contigs can in general are any such path drawn from what we call the "contig tree" which is the tree (or forest of trees) generated by the directed edges of the 'continuations'. They needn't be spanning from root to leaf. """ # a list of all the unique nodes (runs) nodes = list(self.run_idxs) # first find the roots of the forest, start with all of them # and eliminate them if any directed edge points out of them # (in the first position of a continuation; read a # continuation (say (B,A)) as B continues A, thus the edge is # A <- B) roots = nodes for edge_source, edge_target in self.continuations: # if the edge source node is still in roots pop it out, # because a root can never be a source in an edge if edge_source in roots: _ = roots.pop(roots.index(edge_source)) # collect all the spanning contigs spanning_contigs = [] # roots should be sorted already, so we just iterate over them for root in roots: # get the spanning paths by passing the continuation edges # and this root to this recursive static method root_spanning_contigs = self._spanning_paths(self.continuations, root) spanning_contigs.extend(root_spanning_contigs) return spanning_contigs def contig_tree(self): """Returns a networkx directed graph where each node is a run index and the connections between them are the continuations. 
""" contig_tree = nx.DiGraph() # first add all the runs as nodes contig_tree.add_nodes_from(self.run_idxs) # then add the continuations as edges contig_tree.add_edges_from(self.continuations) return contig_tree def is_contig(self, run_idxs): """This method checks that if a given list of run indices is a valid contig or not. """ run_idx_continuations = [np.array([run_idxs[idx+1], run_idxs[idx]]) for idx in range(len(run_idxs)-1)] #gets the contigs array continuations = self.settings_grp['continuations'][:] # checks if sub contigs are in contigs list or not. for run_continuous in run_idx_continuations: contig = False for continuous in continuations: if np.array_equal(run_continuous, continuous): contig = True if not contig: return False return True def contig_trace_to_trace(self, run_idxs, contig_trace): """This method takes a trace of frames over the given contig (run_idxs), itself a list of tuples (traj_idx, cycle_idx) and converts it to a trace valid for actually accessing values from a WepyHDF5 object i.e. a list of tuples (run_idx, traj_idx, cycle_idx). """ # check the contig to make sure it is a valid contig if not self.is_contig(run_idxs): raise ValueError("The run_idxs provided are not a valid contig, {}.".format( run_idxs)) # get the number of cycles in each of the runs in the contig runs_n_cycles = [] for run_idx in run_idxs: runs_n_cycles.append(self.run_n_cycles(run_idx)) # cumulative number of cycles run_cum_cycles = np.cumsum(runs_n_cycles) # add a zero to the beginning for the starting point of no # frames run_cum_cycles = np.hstack( ([0], run_cum_cycles) ) # go through the frames of the contig trace and convert them new_trace = [] for traj_idx, contig_cycle_idx in contig_trace: # get the index of the run that this cycle idx of the # contig is in frame_run_idx = np.searchsorted(run_cum_cycles, contig_cycle_idx, side='right') - 1 # use that to get the total number of frames up until the # point of that and subtract that from the contig cycle # idx to get the cycle index in the run run_frame_idx = contig_cycle_idx - run_cum_cycles[frame_run_idx] # the tuple for this frame for the whole file trace_frame_idx = (frame_run_idx, traj_idx, run_frame_idx) # add it to the trace new_trace.append(trace_frame_idx) return new_trace def records_grp(self, run_idx, run_record_key): path = "runs/{}/{}".format(run_idx, run_record_key) return self.h5[path] def resampling_grp(self, run_idx): return self.records_grp(run_idx, RESAMPLING) def resampler_grp(self, run_idx): return self.records_grp(run_idx, RESAMPLER) def warping_grp(self, run_idx): return self.records_grp(run_idx, WARPING) def bc_grp(self, run_idx): return self.records_grp(run_idx, BC) def progress_grp(self, run_idx): return self.records_grp(run_idx, PROGRESS) def run_records(self, run_idx, run_record_key): # wrap this in a list since the underlying functions accept a # list of records run_idxs = [run_idx] return self.contig_records(run_idxs, run_record_key) def contig_records(self, run_idxs, run_record_key): # if there are no fields return an empty list record_fields = self.record_fields[run_record_key] if len(record_fields) == 0: return [] # get the iterator for the record idxs, if the group is # sporadic then we just use the cycle idxs if self._is_sporadic_records(run_record_key): records = self._run_records_sporadic(run_idxs, run_record_key) else: records = self._run_records_continual(run_idxs, run_record_key) return records # TODO remove def cycle_tree(self): # make a network which has nodes (run_idx, cycle_idx) and the # main 
attibute is the resampling steps for that cycle cycle_tree = nx.DiGraph() # first go through each run without continuations for run_idx in self.run_idxs: n_cycles = self.run_n_cycles(run_idx) # make all the nodes for this run nodes = [(run_idx, step_idx) for step_idx in range(n_cycles)] cycle_tree.add_nodes_from(nodes) # the same for the edges edge_node_idxs = list(zip(range(1, n_cycles), range(n_cycles - 1))) edges = [(nodes[a], nodes[b]) for a, b in edge_node_idxs] cycle_tree.add_edges_from(edges) # after we have added all the nodes and edges for the run # subgraphs we need to connect them together with the # information in the contig tree. for edge_source, edge_target in self.continuations: # for the source node (the restart run) we use the run_idx # from the edge source node and the index of the first # cycle source_node = (edge_source, 0) # for the target node (the run being continued) we use the # run_idx from the edge_target and the last cycle index in # the run target_node = (edge_target, self.run_n_cycles(edge_target)-1) # make the edge edge = (source_node, target_node) # add this connector edge to the network cycle_tree.add_edge(*edge) return cycle_tree def contig_tree_records(self, run_record_keys): """Get records in the form of a tree for the whole contig tree. Each collection of records for a run will be in the node of the contig tree corresponding to the run. """ contig_tree = self.contig_tree() # just loop through each run and get the records for it then # assign it to the node in the contig tree for run_idx in self.run_idxs: for run_record_key in run_record_keys: contig_tree.nodes[run_idx][run_record_key] = self.run_records(run_idx, run_record_key) return contig_tree def _run_record_namedtuple(self, run_record_key): Record = namedtuple('{}_Record'.format(run_record_key), ['cycle_idx'] + self.record_fields[run_record_key]) return Record def _convert_record_field_to_table_column(self, run_idx, run_record_key, record_field): # get the field dataset rec_grp = self.records_grp(run_idx, run_record_key) dset = rec_grp[record_field] # if it is variable length or if it has more than one element # cast all elements to tuples if h5py.check_dtype(vlen=dset.dtype) is not None: rec_dset = [tuple(value) for value in dset[:]] # if it is not variable length make sure it is not more than a # 1D feature vector elif len(dset.shape) > 2: raise TypeError( "cannot convert fields with feature vectors more than 1 dimension," " was given {} for {}/{}".format( dset.shape[1:], run_record_key, record_field)) # if it is only a rank 1 feature vector and it has more than # one element make a tuple out of it elif dset.shape[1] > 1: rec_dset = [tuple(value) for value in dset[:]] # otherwise just get the single value instead of keeping it as # a single valued feature vector else: rec_dset = [value[0] for value in dset[:]] return rec_dset def _convert_record_fields_to_table_columns(self, run_idx, run_record_key): fields = {} for record_field in self.record_fields[run_record_key]: fields[record_field] = self._convert_record_field_to_table_column( run_idx, run_record_key, record_field) return fields def _make_records(self, run_record_key, cycle_idxs, fields): Record = self._run_record_namedtuple(run_record_key) # for each record we make a tuple and yield it records = [] for record_idx in range(len(cycle_idxs)): # make a record for this cycle record_d = {'cycle_idx' : cycle_idxs[record_idx]} for record_field, column in fields.items(): datum = column[record_idx] record_d[record_field] = datum record = 
Record(**record_d) records.append(record) return records def _run_records_sporadic(self, run_idxs, run_record_key): # we loop over the run_idxs in the contig and get the fields # and cycle idxs for the whole contig fields = None cycle_idxs = np.array([], dtype=int) # keep a cumulative total of the runs cycle idxs prev_run_cycle_total = 0 for run_idx in run_idxs: # get all the value columns from the datasets, and convert # them to something amenable to a table run_fields = self._convert_record_fields_to_table_columns(run_idx, run_record_key) # we need to concatenate each field to the end of the # field in the master dictionary, first we need to # initialize it if it isn't already made if fields is None: # if it isn't initialized we just set it as this first # run fields dictionary fields = run_fields else: # if it is already initialized we need to go through # each field and concatenate for field_name, field_data in run_fields.items(): # just add it to the list of fields that will be concatenated later fields[field_name].extend(field_data) # get the cycle idxs for this run rec_grp = self.records_grp(run_idx, run_record_key) run_cycle_idxs = rec_grp[CYCLE_IDXS][:] # add the total number of cycles that came before this run # to each of the cycle idxs to get the cycle_idxs in terms # of the full contig run_contig_cycle_idxs = run_cycle_idxs + prev_run_cycle_total # add these cycle indices to the records for the whole contig cycle_idxs = np.hstack( (cycle_idxs, run_contig_cycle_idxs) ) # add the total number of cycle_idxs from this run to the # running total prev_run_cycle_total += self.run_n_cycles(run_idx) # then make the records from the fields records = self._make_records(run_record_key, cycle_idxs, fields) return records def _run_records_continual(self, run_idxs, run_record_key): cycle_idxs = np.array([], dtype=int) fields = None prev_run_cycle_total = 0 for run_idx in run_idxs: # get all the value columns from the datasets, and convert # them to something amenable to a table run_fields = self._convert_record_fields_to_table_columns(run_idx, run_record_key) # we need to concatenate each field to the end of the # field in the master dictionary, first we need to # initialize it if it isn't already made if fields is None: # if it isn't initialized we just set it as this first # run fields dictionary fields = run_fields else: # if it is already initialized we need to go through # each field and concatenate for field_name, field_data in run_fields.items(): # just add it to the list of fields that will be concatenated later fields[field_name].extend(field_data) # get one of the fields (if any to iterate over) record_fields = self.record_fields[run_record_key] main_record_field = record_fields[0] # make the cycle idxs from that run_rec_grp = self.records_grp(run_idx, run_record_key) run_cycle_idxs = list(range(run_rec_grp[main_record_field].shape[0])) # add the total number of cycles that came before this run # to each of the cycle idxs to get the cycle_idxs in terms # of the full contig run_contig_cycle_indices = run_cycle_idxs + prev_run_cycle_total # add these cycle indices to the records for the whole contig cycle_idxs = np.hstack( (cycle_idxs, run_contig_cycle_idxs) ) # add the total number of cycle_idxs from this run to the # running total prev_run_cycle_total += self.run_n_cycles(run_idx) # then make the records from the fields records = self._make_records(run_record_key, cycle_idxs, fields) return records def run_records_dataframe(self, run_idx, run_record_key): records = 
self.run_records(run_idx, run_record_key) return pd.DataFrame(records) def contig_records_dataframe(self, run_idxs, run_record_key): records = self.contig_records(run_idxs, run_record_key) return pd.DataFrame(records) # application level specific methods for each main group # resampling def resampling_records(self, run_idxs): return self.contig_records(run_idxs, RESAMPLING) def resampling_records_dataframe(self, run_idxs): return pd.DataFrame(self.resampling_records(run_idxs)) # resampler records def resampler_records(self, run_idxs): return self.contig_records(run_idxs, RESAMPLER) def resampler_records_dataframe(self, run_idxs): return pd.DataFrame(self.resampler_records(run_idxs)) # warping def warping_records(self, run_idxs): return self.contig_records(run_idxs, WARPING) def warping_records_dataframe(self, run_idxs): return pd.DataFrame(self.warping_records(run_idxs)) # boundary conditions def bc_records(self, run_idxs): return self.contig_records(run_idxs, BC) def bc_records_dataframe(self, run_idxs): return pd.DataFrame(self.bc_records(run_idxs)) # progress def progress_records(self, run_idxs): return self.contig_records(run_idxs, PROGRESS) def progress_records_dataframe(self, run_idxs): return pd.DataFrame(self.progress_records(run_idxs)) @staticmethod def resampling_panel(resampling_records, is_sorted=False): """Converts a simple collection of resampling records into a list of elements corresponding to cycles. It is like doing a pivot on the step indices into an extra dimension. Hence it can be thought of as a list of tables indexed by the cycle, hence the name panel. """ resampling_panel = [] # if the records are not sorted this must be done: if not is_sorted: resampling_records.sort() # iterate through the resampling records rec_it = iter(resampling_records) cycle_idx = 0 cycle_recs = [] stop = False while not stop: # iterate through records until either there is none left or # until you get to the next cycle cycle_stop = False while not cycle_stop: try: rec = next(rec_it) except StopIteration: # this is the last record of all the records stop = True # this is the last record for the last cycle as well cycle_stop = True # alias for the current cycle curr_cycle_recs = cycle_recs else: # if the resampling record retrieved is from the next # cycle we finish the last cycle if rec.cycle_idx > cycle_idx: cycle_stop = True # save the current cycle as a special # list which we will iterate through # to reduce down to the bare # resampling record curr_cycle_recs = cycle_recs # start a new cycle_recs for the record # we just got cycle_recs = [rec] cycle_idx += 1 if not cycle_stop: cycle_recs.append(rec) else: # we need to break up the records in the cycle into steps cycle_table = [] # temporary container for the step we are working on step_recs = [] step_idx = 0 step_stop = False cycle_it = iter(curr_cycle_recs) while not step_stop: try: cycle_rec = next(cycle_it) # stop the step if this is the last record for the cycle except StopIteration: step_stop = True # alias for the current step curr_step_recs = step_recs # or if the next stop index has been obtained else: #if cycle_rec[RESAMPLING_RECORD_FIELDS.index(STEP)] > step_idx: if cycle_rec.step_idx > step_idx: step_stop = True # save the current step as a special # list which we will iterate through # to reduce down to the bare # resampling record curr_step_recs = step_recs # start a new step_recs for the record # we just got step_recs = [cycle_rec] step_idx += 1 if not step_stop: step_recs.append(cycle_rec) else: # go through the walkers 
for this step since it is completed step_row = [None for i in range(len(curr_step_recs))] for walker_rec in curr_step_recs: # collect data from the record walker_idx = walker_rec.walker_idx decision_id = walker_rec.decision_id instruction = walker_rec.target_idxs # set the resampling record for the walker in the step records step_row[walker_idx] = (decision_id, instruction) # add the records for this step to the cycle table cycle_table.append(step_row) # add the table for this cycles records to the parent panel resampling_panel.append(cycle_table) return resampling_panel def run_resampling_panel(self, run_idx): return self.contig_resampling_panel([run_idx]) def contig_resampling_panel(self, run_idxs): # check the contig to make sure it is a valid contig if not self.is_contig(run_idxs): raise ValueError("The run_idxs provided are not a valid contig, {}.".format( run_idxs)) # make the resampling panel from the resampling records for the contig contig_resampling_panel = self.resampling_panel(self.resampling_records(run_idxs)) return contig_resampling_panel # TODO remove def cycle_tree_resampling_panel(self): """Instead of a resampling panel this is the same thing except each cycle is a node rather than a table in the panel. """ # get the cycle tree cycle_tree = self.cycle_tree() # then get the resampling tables for each cycle and put them # as attributes to the appropriate nodes for run_idx in self.run_idxs: run_resampling_panel = self.run_resampling_panel(run_idx) # add each cycle of this panel to the network by adding # them in as nodes with the resampling steps first for step_idx, step in enumerate(run_resampling_panel): node = (run_idx, step_idx) cycle_tree.nodes[node]["resampling_steps"] = step return cycle_tree def join(self, other_h5): """Given another WepyHDF5 file object does a left join on this file. Renumbering the runs starting from this file. 
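For example, if this file currently holds runs 0 and 1 and `other_h5`
holds two runs of its own, after the join this file holds four runs,
with the other file's runs copied in as runs 2 and 3 (numbering picks
up from `next_run_idx()`).

A usage sketch; the file names are hypothetical and this file must be
open in a writable mode::

    main_h5 = WepyHDF5('results_a.wepy.h5', mode='r+')
    other_h5 = WepyHDF5('results_b.wepy.h5', mode='r')
    with main_h5:
        main_h5.join(other_h5)
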
""" with other_h5 as h5: for run_idx in h5.run_idxs: # the other run group handle other_run = h5.run(run_idx) # copy this run to this file in the next run_idx group self.h5.copy(other_run, 'runs/{}'.format(self.next_run_idx())) def to_mdtraj(self, run_idx, traj_idx, frames=None, alt_rep=None): traj_grp = self.traj(run_idx, traj_idx) # the default for alt_rep is the main rep if alt_rep is None: rep_key = POSITIONS rep_path = rep_key else: rep_key = alt_rep rep_path = 'alt_reps/{}'.format(alt_rep) topology = self.get_mdtraj_topology(alt_rep=rep_key) frames = self.get_traj_field_cycle_idxs(run_idx, traj_idx, rep_path) # get the data for all or for the frames specified positions = self.get_traj_field(run_idx, traj_idx, rep_path, frames=frames, masked=False) try: time = self.get_traj_field(run_idx, traj_idx, TIME, frames=frames, masked=False)[:, 0] except KeyError: warn("time not in this trajectory, ignoring") time = None try: box_vectors = self.get_traj_field(run_idx, traj_idx, BOX_VECTORS, frames=frames, masked=False) except KeyError: warn("box_vectors not in this trajectory, ignoring") box_vectors = None if box_vectors is not None: unitcell_lengths, unitcell_angles = traj_box_vectors_to_lengths_angles(box_vectors) if (box_vectors is not None) and (time is not None): traj = mdj.Trajectory(positions, topology, time=time, unitcell_lengths=unitcell_lengths, unitcell_angles=unitcell_angles) elif box_vectors is not None: traj = mdj.Trajectory(positions, topology, unitcell_lengths=unitcell_lengths, unitcell_angles=unitcell_angles) elif time is not None: traj = mdj.Trajectory(positions, topology, time=time) else: traj = mdj.Trajectory(positions, topology) return traj def trace_to_mdtraj(self, trace, alt_rep=None): # the default for alt_rep is the main rep if alt_rep is None: rep_key = POSITIONS rep_path = rep_key else: rep_key = alt_rep rep_path = 'alt_reps/{}'.format(alt_rep) topology = self.get_mdtraj_topology(alt_rep=rep_key) trace_fields = self.get_trace_fields(trace, [rep_path, BOX_VECTORS]) unitcell_lengths, unitcell_angles = traj_box_vectors_to_lengths_angles( trace_fields[BOX_VECTORS]) cycles = [cycle for run, cycle, walker in trace] traj = mdj.Trajectory(trace_fields[rep_key], topology, time=cycles, unitcell_lengths=unitcell_lengths, unitcell_angles=unitcell_angles) return traj def run_trace_to_mdtraj(self, run_idx, trace, alt_rep=None): # the default for alt_rep is the main rep if alt_rep is None: rep_key = POSITIONS rep_path = rep_key else: rep_key = alt_rep rep_path = 'alt_reps/{}'.format(alt_rep) topology = self.get_mdtraj_topology(alt_rep=rep_key) trace_fields = self.get_run_trace_fields(run_idx, trace, [rep_path, BOX_VECTORS]) unitcell_lengths, unitcell_angles = traj_box_vectors_to_lengths_angles( trace_fields[BOX_VECTORS]) cycles = [cycle for cycle, walker in trace] traj = mdj.Trajectory(trace_fields[rep_key], topology, time=cycles, unitcell_lengths=unitcell_lengths, unitcell_angles=unitcell_angles) return traj def _check_data_compliance(traj_data, compliance_requirements=COMPLIANCE_REQUIREMENTS): """Given a dictionary of trajectory data it returns the COMPLIANCE_TAGS that the data satisfies. 
""" # cast the nested tuples to a dictionary if necessary compliance_dict = dict(compliance_requirements) fields = set() for field, value in traj_data.items(): # don't check observables if field in [OBSERVABLES]: continue # check to make sure the value actually has something in it if (value is not None) and len(value) > 0: fields.update([field]) compliances = [] for compliance_tag, compliance_fields in compliance_dict.items(): compliance_fields = set(compliance_fields) # if the fields are a superset of the compliance fields for # this compliance type then it satisfies it if fields.issuperset(compliance_fields): compliances.append(compliance_tag) return compliances def iter_field_paths(grp): field_paths = [] for field_name in grp: if isinstance(grp[field_name], h5py.Group): for subfield in grp[field_name]: field_paths.append(field_name + '/' + subfield) else: field_paths.append(field_name) return field_paths class RunCycleSlice(object): def __init__(self, run_idx, cycles, wepy_hdf5_file): self._h5 = wepy_hdf5_file._h5 self.run_idx = run_idx self.cycles = cycles self.mode = wepy_hdf5_file.mode def traj(self, run_idx, traj_idx): return self._h5['runs/{}/trajectories/{}'.format(run_idx, traj_idx)] def run_trajs(self, run_idx): return self._h5['runs/{}/trajectories'.format(run_idx)] def n_run_trajs(self, run_idx): return len(self._h5['runs/{}/trajectories'.format(run_idx)]) def run_traj_idxs(self, run_idx): return range(len(self._h5['runs/{}/trajectories'.format(run_idx)])) def run_traj_idx_tuples(self): tups = [] for traj_idx in self.run_traj_idxs(self.run_idx): tups.append((self.run_idx, traj_idx)) return tups def run_cycle_idx_tuples(self): tups = [] for cycle_idx in self.cycles: tups.append((self.run_idx, cycle_idx)) return tups def iter_trajs(self, idxs=False, traj_sel=None): """Generator for all of the trajectories in the dataset across all runs. If idxs=True will return a tuple of (run_idx, traj_idx). run_sel : if True will iterate over a subset of trajectories. Possible values are an iterable of `(run_idx, traj_idx)` tuples. """ # set the selection of trajectories to iterate over if traj_sel is None: idx_tups = self.run_traj_idx_tuples() else: idx_tups = traj_sel # get each traj for each idx_tup and yield them for the generator for run_idx, traj_idx in idx_tups: traj = self.traj(run_idx, traj_idx) if idxs: yield (run_idx, traj_idx), traj else: yield traj def iter_cycle_fields(self, fields, cycle_idx, idxs=False, traj_sel=None): """Generator for all of the specified non-compound fields h5py.Datasets for all trajectories in the dataset across all runs. Fields is a list of valid relative paths to datasets in the trajectory groups. 
""" for field in fields: dsets = {} fields_data = () for idx_tup, traj in self.iter_trajs(idxs=True, traj_sel=traj_sel): run_idx, traj_idx = idx_tup try: dset = traj[field][cycle_idx] if not isinstance(dset, np.ndarray): dset = np.array([dset]) if len(dset.shape)==1: fields_data += (dset,) else: fields_data += ([dset],) except KeyError: warn("field \"{}\" not found in \"{}\"".format(field, traj.name), RuntimeWarning) dset = None dsets = np.concatenate(fields_data, axis=0) if idxs: yield (run_idx, traj_idx, field), dsets else: yield field, dsets def iter_cycles_fields(self, fields, idxs=False, traj_sel=None): for cycle_idx in self.cycles: dsets = {} for field, dset in self.iter_cycle_fields(fields, cycle_idx, traj_sel=traj_sel): dsets[field] = dset if idxs: yield (cycle_idx, dsets) else: yield dsets def traj_cycles_map(self, func, fields, *args, map_func=map, idxs=False, traj_sel=None): """Function for mapping work onto field of trajectories in the WepyHDF5 file object. Similar to traj_map, except `h5py.Group` objects cannot be pickled for message passing. So we select the fields to serialize instead and pass the `numpy.ndarray`s to have the work mapped to them. func : the function that will be mapped to trajectory groups fields : list of fields that will be serialized into a dictionary and passed to the map function. These must be valid `h5py` path strings relative to the trajectory group. These include the standard fields like 'positions' and 'weights', as well as compound paths e.g. 'observables/sasa'. map_func : the function that maps the function. This is where parallelization occurs if desired. Defaults to the serial python map function. traj_sel : a trajectory selection. This is a valid `traj_sel` argument for the `iter_trajs` function. *args : additional arguments to the function. If this is an iterable it will be assumed that it is the appropriate length for the number of trajectories, WARNING: this will not be checked and could result in a run time error. Otherwise single values will be automatically mapped to all trajectories. """ # check the args and kwargs to see if they need expanded for # mapping inputs mapped_args = [] for arg in args: # if it is a sequence or generator we keep just pass it to the mapper if isinstance(arg, list) and not isinstance(arg, str): assert len(arg) == len(self.cycles), "Sequence has fewer" mapped_args.append(arg) # if it is not a sequence or generator we make a generator out # of it to map as inputs else: mapped_arg = (arg for i in range(len(self.cycles))) mapped_args.append(mapped_arg) results = map_func(func, self.iter_cycles_fields(fields, traj_sel=traj_sel, idxs=False), *mapped_args) if idxs: if traj_sel is None: traj_sel = self.run_cycle_idx_tuples() return zip(traj_sel, results) else: return results def compute_observable(self, func, fields, *args, map_func=map, traj_sel=None, save_to_hdf5=None, idxs=False, return_results=True): """Compute an observable on the trajectory data according to a function. Optionally save that data in the observables data group for the trajectory. 
""" if save_to_hdf5 is not None: assert self.mode in ['w', 'w-', 'x', 'r+', 'c', 'c-'],\ "File must be in a write mode" assert isinstance(save_to_hdf5, str),\ "`save_to_hdf5` should be the field name to save the data in the `observables` group in each trajectory" field_name=save_to_hdf5 #DEBUG enforce this until sparse trajectories are implemented assert traj_sel is None, "no selections until sparse trajectory data is implemented" idx =0 for result in self.traj_cycles_map(func, fields, *args, map_func=map_func, traj_sel=traj_sel, idxs=True): idx_tup, obs_value = result run_idx, traj_idx = idx_tup # if we are saving this to the trajectories observables add it as a dataset if save_to_hdf5: logging.info("Saving run {} traj {} observables/{}".format( run_idx, traj_idx, field_name)) # try to get the observables group or make it if it doesn't exist try: obs_grp = self.traj(run_idx, traj_idx)[OBSERVABLES] except KeyError: logging.info("Group uninitialized. Initializing.") obs_grp = self.traj(run_idx, traj_idx).create_group(OBSERVABLES) # try to create the dataset try: obs_grp.create_dataset(field_name, data=obs_value) # if it fails we either overwrite or raise an error except RuntimeError: # if we are in a permissive write mode we delete the # old dataset and add the new one, overwriting old data if self.mode in ['w', 'w-', 'x', 'r+']: logging.info("Dataset already present. Overwriting.") del obs_grp[field_name] obs_grp.create_dataset(field_name, data=obs_value) # this will happen in 'c' and 'c-' modes else: raise RuntimeError( "Dataset already exists and file is in concatenate mode ('c' or 'c-')") # also return it if requested if return_results: if idxs: yield idx_tup, obs_value else: yield obs_value PK!wepy/orchestration/__init__.pyPK!FO5(5(wepy/orchestration/cli.pyimport os.path as osp import logging import click from wepy.orchestration.orchestrator import deserialize_orchestrator, \ reconcile_orchestrators, \ Orchestrator, \ recover_run_by_time from wepy.reporter.hdf5 import WepyHDF5Reporter from wepy.hdf5 import WepyHDF5 ORCHESTRATOR_DEFAULT_FILENAME = \ Orchestrator.ORCH_FILENAME_TEMPLATE.format(config=Orchestrator.DEFAULT_CONFIG_NAME, narration=Orchestrator.DEFAULT_NARRATION) @click.group() def cli(): pass def set_loglevel(loglevel): # try to cast the loglevel as an integer. If that fails interpret # it as a string. try: loglevel_num = int(loglevel) except ValueError: loglevel_num = getattr(logging, loglevel, None) # if no such log level exists in logging the string was invalid if loglevel_num is None: raise ValueError("invalid log level given") logging.basicConfig(level=loglevel_num) START_HASH = '' CURDIR = '' def settle_run_options(n_workers=None, job_dir=None, job_name=None, narration=None): # the default for the job name is the start hash if none is given if job_name == START_HASH: job_name = start_hash # if the job_name is given and the default value for th job_dir is # given we set the job-dir as the job_name if job_name is not None and job_dir == CURDIR: job_dir = job_name # if the special value for curdir is given we get the systems # current directory, this is the default. 
if job_dir == CURDIR: job_dir = osp.curdir # normalize the job_dir job_dir = osp.realpath(job_dir) return n_workers, job_dir, job_name, narration @click.option('--log', default="WARNING") @click.option('--n-workers', type=click.INT) @click.option('--checkpoint-freq', default=None, type=click.INT) @click.option('--job-dir', default=CURDIR, type=click.Path(writable=True)) @click.option('--job-name', default=START_HASH) @click.option('--narration', default="") @click.argument('n_cycle_steps', type=click.INT) @click.argument('run_time', type=click.FLOAT) @click.argument('start_hash') @click.argument('orchestrator', type=click.File(mode='rb')) @click.command() def run(log, n_workers, checkpoint_freq, job_dir, job_name, narration, n_cycle_steps, run_time, start_hash, orchestrator): set_loglevel(log) # settle what the defaults etc. are for the different options as they are interdependent n_workers, job_dir, job_name, narration = settle_run_options(n_workers=n_workers, job_dir=job_dir, job_name=job_name, narration=narration) orch = deserialize_orchestrator(orchestrator.read()) logging.info("Orchestrator loaded") start_hash, end_hash = orch.orchestrate_snapshot_run_by_time(start_hash, run_time, n_cycle_steps, checkpoint_freq=checkpoint_freq, work_dir=job_dir, config_name=job_name, narration=narration, n_workers=n_workers) # write the run tuple out to the log run_line_str = "Run start and end hashes: {}, {}".format(start_hash, end_hash) # log it logging.info(run_line_str) # also put it to the terminal click.echo(run_line_str) @click.option('--log', default="WARNING") @click.option('--n-workers', type=click.INT) @click.option('--checkpoint-freq', default=None, type=click.INT) @click.option('--job-dir', default=CURDIR, type=click.Path(writable=True)) @click.option('--job-name', default=START_HASH) @click.option('--narration', default="recovery") @click.argument('n_cycle_steps', type=click.INT) @click.argument('run_time', type=click.FLOAT) @click.argument('checkpoint', type=click.File(mode='rb')) @click.argument('start_hash') @click.argument('orchestrator', type=click.File(mode='rb')) @click.command() def recover(log, n_workers, checkpoint_freq, job_dir, job_name, narration, n_cycle_steps, run_time, checkpoint, start_hash, orchestrator): set_loglevel(log) n_workers, job_dir, job_name, narration = settle_run_options(n_workers=n_workers, job_dir=job_dir, job_name=job_name, narration=narration) orch = deserialize_orchestrator(orchestrator.read()) logging.info("Orchestrator loaded") checkpoint_orch = deserialize_orchestrator(checkpoint.read()) logging.info("Checkpoint loadeded") # run the continuation from the new orchestrator with the update # from the checkpoint new_orch, run_tup = recover_run_by_time(orch, checkpoint_orch, run_time, n_cycle_steps, checkpoint_freq=checkpoint_freq, work_dir=job_dir, config_name=job_name, narration=narration) start_hash, end_hash = run_tup # write the run tuple out to the log run_line_str = "Run start and end hashes: {}, {}".format(start_hash, end_hash) # log it logging.info(run_line_str) # also put it to the terminal click.echo(run_line_str) def combine_orch_wepy_hdf5s(new_orch, new_hdf5_path): # a key-value for the paths for each run hdf5_paths = {} # go through each run in the new orchestrator for run_id in new_orch.runs: # get the configuration used for this run run_config = new_orch.run_configuration(*run_id) # from that configuration find the WepyHDF5Reporters for reporter in run_config.reporters: if isinstance(reporter, WepyHDF5Reporter): # and save the path for 
that run hdf5_paths[run_id] = reporter.file_path # now that we have the paths (or lack of paths) for all # the runs we need to start linking them all # together. # first we need a master linker HDF5 to do this with # so load a template WepyHDF5 template_wepy_h5_path = hdf5_paths[new_orch.runs[0]] template_wepy_h5 = WepyHDF5(template_wepy_h5_path, mode='r') # clone it with template_wepy_h5: master_wepy_h5 = template_wepy_h5.clone(new_hdf5_path, mode='x') with master_wepy_h5: # then link all the files to it run_mapping = {} for run_id, wepy_h5_path in hdf5_paths.items(): # we just link the whole file then sort out the # continuations later since we aren't necessarily doing # this in a logical order new_run_idxs = master_wepy_h5.link_file_runs(wepy_h5_path) # map the hash id to the new run idx created. There should # only be one if we are following the orchestration # workflow. run_mapping[run_id] = new_run_idxs[0] # now that they are all linked we need to set the # continuations correctly, so for each run we find the run it # continues in the orchestrator for run_id, run_idx in run_mapping.items(): # find the run_id that this one continues continued_run_id = new_orch.run_continues(*run_id) # if a None is returned then there was no continuation if continued_run_id is None: # so we go to the next run_id and don't log any # continuation continue # get the run_idx in the HDF5 that corresponds to this run continued_run_idx = run_mapping[continued_run_id] # add the continuation master_wepy_h5.add_continuation(run_idx, continued_run_idx) @click.command() @click.option('--hdf5', type=click.Path(exists=False)) @click.argument('output', nargs=1, type=click.File(mode='wb')) @click.argument('orchestrators', nargs=-1, type=click.File(mode='rb')) def reconcile(hdf5, output, orchestrators): # reconcile them one by one as they are big and too expensive to # load all into memory at once new_orch = deserialize_orchestrator(orchestrators[0].read()) for orchestrator in orchestrators[1:]: orch = deserialize_orchestrator(orchestrator.read()) # reconcile the two orchestrators new_orch = reconcile_orchestrators(new_orch, orch) # if a path for an HDF5 file is given if hdf5 is not None: hdf5_path = osp.realpath(hdf5) # combine the HDF5 files from those orchestrators combine_orch_wepy_hdf5s(new_orch, hdf5_path) # then make and output the orchestrator output.write(new_orch.serialize()) def hash_listing_formatter(hashes): hash_listing_str = '\n'.join(hashes) return hash_listing_str @click.argument('orchestrator', type=click.File(mode='rb')) @click.command() def ls_snapshots(orchestrator): orch = deserialize_orchestrator(orchestrator.read()) message = hash_listing_formatter(orch.snapshot_hashes) click.echo(message) @click.argument('orchestrator', type=click.File(mode='rb')) @click.command() def ls_runs(orchestrator): orch = deserialize_orchestrator(orchestrator.read()) runs = orch.runs hash_listing_str = "\n".join(["{}, {}".format(start, end) for start, end in runs]) click.echo(hash_listing_str) # command groupings cli.add_command(run) cli.add_command(recover) cli.add_command(reconcile) cli.add_command(ls_snapshots) cli.add_command(ls_runs) if __name__ == "__main__": cli() PK!{Hh))#wepy/orchestration/configuration.pyimport os.path as osp from copy import deepcopy import logging from wepy.work_mapper.mapper import Mapper, WorkerMapper from wepy.work_mapper.worker import Worker class Configuration(): DEFAULT_WORKDIR = osp.realpath(osp.curdir) DEFAULT_CONFIG_NAME = "root" DEFAULT_NARRATION = "" DEFAULT_MODE = 'x' def 
__init__(self, # reporters config_name=None, work_dir=None, mode=None, narration=None, reporter_classes=None, reporter_partial_kwargs=None, # work mappers n_workers=None, worker_type=None, work_mapper_class=None, work_mapper_partial_kwargs=None): ## reporter stuff # reporters and partial kwargs if reporter_classes is not None: self._reporter_classes = reporter_classes else: self._reporter_classes = [] if reporter_partial_kwargs is not None: self._reporter_partial_kwargs = reporter_partial_kwargs else: self._reporter_partial_kwargs = [] # file path localization variables # config string if config_name is not None: self._config_name = config_name else: self._config_name = self.DEFAULT_CONFIG_NAME if work_dir is not None: self._work_dir = work_dir else: self._work_dir = self.DEFAULT_WORKDIR # narration if narration is not None: narration = "_{}".format(narration) if len(narration) > 0 else "" self._narration = narration else: self._narration = self.DEFAULT_NARRATION # file modes, if none are given we set to the default, this # needs to be done before generating the reporters if mode is not None: self._mode = mode else: self._mode = self.DEFAULT_MODE # generate the reporters for this configuration self._reporters = self._gen_reporters() ## work mapper # the partial kwargs that will be passed for reparametrization if work_mapper_partial_kwargs is None: self._work_mapper_partial_kwargs = {} else: self._work_mapper_partial_kwargs = work_mapper_partial_kwargs # if the number of workers was sepcified and no work_mapper # class was specified default to the WorkerMapper if (n_workers is not None) and (work_mapper_class is None): self._n_workers = n_workers self._work_mapper_class = WorkerMapper # if no number of workers was specified and no work_mapper # class was specified we default to the serial mapper elif (n_workers is None) and (work_mapper_class is None): self._n_workers = None self._work_mapper_class = Mapper # otherwise if the work_mapper class was given we use it and # whatever the number of workers was else: self._n_workers = n_workers self._work_mapper_class = work_mapper_class # the default worker type if none was given if worker_type is None: worker_type = Worker # set the worker type self._worker_type = worker_type # then generate a work mapper self._work_mapper = self._work_mapper_class(num_workers=self._n_workers, worker_type=self._worker_type, **self._work_mapper_partial_kwargs) @property def reporter_classes(self): return self._reporter_classes @property def reporter_partial_kwargs(self): return self._reporter_partial_kwargs @property def config_name(self): return self._config_name @property def work_dir(self): return self._work_dir @property def narration(self): return self._narration @property def mode(self): return self._mode @property def reporters(self): return self._reporters @property def work_mapper_class(self): return self._work_mapper_class @property def work_mapper_partial_kwargs(self): return self._work_mapper_partial_kwargs @property def n_workers(self): return self._n_workers @property def worker_type(self): return self._worker_type @property def work_mapper(self): return self._work_mapper def _gen_reporters(self): reporters = [] for idx, reporter_class in enumerate(self.reporter_classes): filename = reporter_class.SUGGESTED_FILENAME_TEMPLATE.format( narration=self.narration, config=self.config_name, ext=reporter_class.SUGGESTED_EXTENSION) file_path = osp.join(self.work_dir, filename) file_paths = [file_path] modes = [self.mode for i in range(len(file_paths))] 
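            # A hedged construction sketch (the reporter class and its partial
            # kwargs are illustrative): a configuration built as
            #
            #   config = Configuration(config_name='my_sim',
            #                          narration='test',
            #                          reporter_classes=[WepyHDF5Reporter],
            #                          reporter_partial_kwargs=[{'topology': json_top}],
            #                          n_workers=4)
            #
            # would, in this loop, render a file path along the lines of
            # '<work_dir>/my_sim_test.wepy.h5' (the exact form comes from the
            # reporter class's SUGGESTED_FILENAME_TEMPLATE and
            # SUGGESTED_EXTENSION) and instantiate the reporter below with that
            # path, the file mode, and the partial kwargs.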
reporter = reporter_class(file_paths=file_paths, modes=modes, **self.reporter_partial_kwargs[idx]) reporters.append(reporter) return reporters def _gen_work_mapper(self): work_mapper = self._work_mapper_class(n_workers=self._n_workers) return work_mapper @property def reporters(self): return deepcopy(self._reporters) @property def work_mapper(self): return deepcopy(self._work_mapper) def reparametrize(self, **kwargs): #import ipdb; ipdb.set_trace() # dictionary of the possible reparametrizations from the # current configuration params = {# related to the work mapper 'n_workers' : self.n_workers, 'work_mapper_class' : self.work_mapper_class, 'work_mapper_partial_kwargs' : self.work_mapper_partial_kwargs, # those related to the reporters 'mode' : self.mode, 'config_name' : self.config_name, 'work_dir' : self.work_dir, 'narration' : self.narration, 'reporter_classes' : self.reporter_classes, 'reporter_partial_kwargs' : self.reporter_partial_kwargs} for key, value in kwargs.items(): # if the value is given we replace the old one with it if value is not None: params[key] = value new_configuration = type(self)(**params) return new_configuration PK!|N @cc"wepy/orchestration/orchestrator.pyfrom copy import copy, deepcopy from hashlib import md5 import time import os import os.path as osp from base64 import b64encode, b64decode from zlib import compress, decompress import itertools as it import logging # instead of pickle we use dill, so we can save dynamically defined # classes import dill from wepy.sim_manager import Manager from wepy.orchestration.configuration import Configuration class SimApparatus(): """The simulation apparatus are the components needed for running a simulation without the initial conditions for starting the simulation. A runner is strictly necessary but a resampler and boundary conditions are not. """ def __init__(self, filters): self._filters = deepcopy(filters) @property def filters(self): return self._filters class WepySimApparatus(SimApparatus): def __init__(self, runner, resampler=None, boundary_conditions=None): # add them in the order they are done in Wepy filters = [runner] if boundary_conditions is not None: filters.append(boundary_conditions) if resampler is not None: filters.append(resampler) super().__init__(filters) class SimSnapshot(): def __init__(self, walkers, apparatus): self._walkers = deepcopy(walkers) self._apparatus = deepcopy(apparatus) @property def walkers(self): return self._walkers @property def apparatus(self): return self._apparatus class OrchestratorError(Exception): pass class Orchestrator(): # we freeze the pickle protocol for making hashes, because we care # more about stability than efficiency of newer versions HASH_PICKLE_PROTOCOL = 2 DEFAULT_WORKDIR = Configuration.DEFAULT_WORKDIR DEFAULT_CONFIG_NAME = Configuration.DEFAULT_CONFIG_NAME DEFAULT_NARRATION = Configuration.DEFAULT_NARRATION DEFAULT_MODE = Configuration.DEFAULT_MODE DEFAULT_CHECKPOINT_FILENAME = "checkpoint.chk" ORCH_FILENAME_TEMPLATE = "{config}{narration}.orch" DEFAULT_ORCHESTRATION_MODE = 'xb' def __init__(self, sim_apparatus, default_init_walkers=None, default_configuration=None): # the main dictionary of snapshots keyed by their hashes self._snapshots = {} # the list of "runs" which are tuples of hashes for the starts # and ends of runs. THis just excludes the checkpoints, this # really is for convenience so you can ignore all the # checkpoints when reconstructing full run continuations etc. self._runs = set() # the apparatus for the simulation. 
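        # A hedged sketch of the bookkeeping this class maintains (the hash
        # values are made up): after two runs, the second continuing the first,
        #
        #   self._snapshots ~ {'3aef...': SimSnapshot, '81c2...': SimSnapshot,
        #                      'f00d...': SimSnapshot}
        #   self._runs      ~ {('3aef...', '81c2...'), ('81c2...', 'f00d...')}
        #
        # so run_continues('81c2...', 'f00d...') would report that this run
        # continues the run ('3aef...', '81c2...').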
This is the configuration # and initial conditions independent components necessary for # running a simulation. The primary (and only necessary) # component is the runner. The other components when passed # here are used as the defaults when unspecified later # (e.g. resampler and boundary conditions) self._apparatus = deepcopy(sim_apparatus) # if a configuration was given use this as the default # configuration, if none is given then we will use the default # one if default_configuration is not None: self._configuration = deepcopy(default_configuration) # TODO: make a default configuration class here else: self._configuration = None # we also need to save the configurations for each run self._run_configurations = {} # if initial walkers were given we save them and also make a # snapshot for them if default_init_walkers is not None: self._start_hash = self.gen_start_snapshot(default_init_walkers) def serialize(self): serial_str = dill.dumps(self, recurse=True) return serial_str @classmethod def deserialize(cls, serial_str): orch = dill.loads(serial_str) return orch @classmethod def load(cls, filepath, mode='rb'): with open(filepath, mode) as rf: orch = cls.deserialize(rf.read()) return orch def dump(self, filepath, mode=None): if mode is None: mode = self.DEFAULT_ORCHESTRATION_MODE with open(filepath, mode) as wf: wf.write(self.serialize()) @classmethod def encode(cls, obj): # used for both snapshots and apparatuses even though they # themselves have different methods in the API # we use dill to dump to a string and we always do a deepcopy # of the object to avoid differences in the resulting pickle # object from having multiple references, then we encode in 64 bit return b64encode(compress(dill.dumps(deepcopy(obj), protocol=cls.HASH_PICKLE_PROTOCOL, recurse=True))) @classmethod def decode(cls, encoded_str): return dill.loads(decompress(b64decode(encoded_str))) @classmethod def hash(cls, serial_str): return md5(serial_str).hexdigest() @classmethod def serialize_snapshot(cls, snapshot): return cls.encode(snapshot) def hash_snapshot(self, snapshot): serialized_snapshot = self.serialize_snapshot(snapshot) return self.hash(serialized_snapshot) def get_snapshot(self, snapshot_hash): """Returns a copy of a snapshot.""" return deepcopy(self._snapshots[snapshot_hash]) @property def snapshots(self): return deepcopy(list(self._snapshots.values())) @property def snapshot_hashes(self): return list(self._snapshots.keys()) @property def default_snapshot_hash(self): return self._start_hash @property def default_snapshot(self): return self.get_snapshot(self.default_snapshot_hash) @property def default_init_walkers(self): return self.default_snapshot.walkers @property def default_apparatus(self): return self._apparatus @property def default_configuration(self): return self._configuration def snapshot_registered(self, snapshot): snapshot_md5 = self.hash_snapshot(snapshot) if any([True if snapshot_md5 == h else False for h in self.snapshot_hashes]): return True else: return False def snapshot_hash_registered(self, snapshot_hash): if any([True if snapshot_hash == h else False for h in self.snapshot_hashes]): return True else: return False @property def runs(self): return list(deepcopy(self._runs)) def run_configuration(self, start_hash, end_hash): return deepcopy(self._run_configurations[(start_hash, end_hash)]) def _add_snapshot(self, snaphash, snapshot): # check that the hash is not already in the snapshots if any([True if snaphash == md5 else False for md5 in self.snapshot_hashes]): # just skip the rest of 
the function and return the hash return snaphash self._snapshots[snaphash] = snapshot return snaphash def _gen_checkpoint_orch(self, start_hash, checkpoint_snapshot, configuration): # make an orchestrator with the only run going from the start # hash snapshot to the checkpoint start_snapshot = self.get_snapshot(start_hash) checkpoint_orch = type(self)(start_snapshot.apparatus) # add the the starting snapshot to the orchestrator, we do # this the sneaky way because I am worried about hash # stability and we need to preserve the intial hash, so we # force the hash to be the start_hash and add the object regardless checkpoint_orch._add_snapshot(start_hash, deepcopy(start_snapshot)) # then add a run to this checkpoint orchestrator by adding the # checkpoint snapshot and registering the run checkpoint_hash = checkpoint_orch.add_snapshot(checkpoint_snapshot) # register the run with the two hashes and the configuration checkpoint_orch.register_run(start_hash, checkpoint_hash, configuration) return checkpoint_orch def _save_checkpoint(self, start_hash, checkpoint_snapshot, configuration, checkpoint_dir, mode='wb'): if 'b' not in mode: mode = mode + 'b' # make a checkpoint object which is an orchestrator with only # 1 run in it which is the start and the checkpoint as its end checkpoint_orch = self._gen_checkpoint_orch(start_hash, checkpoint_snapshot, configuration) # construct the checkpoint filename from the template using # the hashes for the start and the checkpoint, we add a "new"" # at the end of the file to indicate that it was just written # and if the other checkpoint is not removed you will be able # to tell them apart, this will get renamed without the new # once the other checkpoint is deleted successfully new_checkpoint_filename = self.DEFAULT_CHECKPOINT_FILENAME + "new" new_checkpoint_path = osp.join(checkpoint_dir, new_checkpoint_filename) # write out the pickle to the file with open(new_checkpoint_path, mode) as wf: wf.write(checkpoint_orch.serialize()) # the path that the checkpoint should be existing checkpoint_path = osp.join(checkpoint_dir, self.DEFAULT_CHECKPOINT_FILENAME) # only after the writing is complete do we delete the old # checkpoint, if there are any to delete if osp.exists(checkpoint_path): os.remove(checkpoint_path) # then rename the one with "new" at the end to the final path os.rename(new_checkpoint_path, checkpoint_path) @classmethod def load_snapshot(cls, file_handle): return cls.decode(file_handle.read()) @classmethod def dump_snapshot(cls, snapshot, file_handle): file_handle.write(cls.serialize_snapshot(snapshot)) def add_snapshot(self, snapshot): # copy the snapshot snapshot = deepcopy(snapshot) # get the hash of the snapshot snaphash = self.hash_snapshot(snapshot) return self._add_snapshot(snaphash, snapshot) def gen_start_snapshot(self, init_walkers): # make a SimSnapshot object using the initial walkers and start_snapshot = SimSnapshot(init_walkers, self.default_apparatus) # save the snapshot, and generate its hash sim_start_md5 = self.add_snapshot(start_snapshot) return sim_start_md5 def gen_sim_manager(self, start_snapshot, configuration): # copy the snapshot to use for the sim_manager start_snapshot = deepcopy(start_snapshot) # construct the sim manager, in a wepy specific way sim_manager = Manager(start_snapshot.walkers, runner=start_snapshot.apparatus.filters[0], boundary_conditions=start_snapshot.apparatus.filters[1], resampler=start_snapshot.apparatus.filters[2], # configuration options work_mapper=configuration.work_mapper, 
reporters=configuration.reporters) return sim_manager def register_run(self, start_hash, end_hash, configuration): # check that the hashes are for snapshots in the orchestrator # if one is not registered raise an error if not self.snapshot_hash_registered(start_hash): raise OrchestratorError( "snapshot start_hash {} is not registered with the orchestrator".format( start_hash)) if not self.snapshot_hash_registered(end_hash): raise OrchestratorError( "snapshot end_hash {} is not registered with the orchestrator".format( end_hash)) # if they both are registered register the segment self._runs.add((start_hash, end_hash)) # add the configuration for this run self._run_configurations[(start_hash, end_hash)] = configuration def new_run_by_time(self, init_walkers, run_time, n_steps, configuration=None, checkpoint_freq=None, checkpoint_dir=None): """Start a new run that will go for a certain amount of time given a new set of initial conditions. """ # make the starting snapshot from the walkers start_hash = self.gen_start_snapshot(init_walkers) # then perform a run with the checkpoints etc using the # dedicated method that works on snapshots return self.run_snapshot_by_time(start_hash, run_time, n_steps, configuration=configuration, checkpoint_freq=checkpoint_freq, checkpoint_dir=checkpoint_dir) def run_snapshot_by_time(self, start_hash, run_time, n_steps, checkpoint_freq=None, checkpoint_dir=None, configuration=None, mode=None): """For a finished run continue it but resetting all the state of the resampler and boundary conditions""" # check that the directory for checkpoints exists, and create # it if it doesn't and isn't already created if checkpoint_dir is not None: checkpoint_dir = osp.realpath(checkpoint_dir) os.makedirs(checkpoint_dir, exist_ok=True) if mode is None: mode = self.DEFAULT_MODE dump_mode = mode if 'b' not in dump_mode: dump_mode = mode + 'b' if configuration is None: configuration = deepcopy(self.default_configuration) # get the snapshot start_snapshot = self.get_snapshot(start_hash) # generate the simulation manager given the snapshot and the # configuration sim_manager = self.gen_sim_manager(start_snapshot, configuration=configuration) # run the init subroutine sim_manager.init() # keep a running list of the checkpoints for this run self._curr_run_checkpoints = [] # run each cycle manually creating checkpoints when necessary walkers = start_snapshot.walkers cycle_idx = 0 start_time = time.time() while time.time() - start_time < run_time: # run the cycle walkers, filters = sim_manager.run_cycle(walkers, n_steps, cycle_idx) # check to see if a checkpoint is necessary if (checkpoint_freq is not None): if (cycle_idx % checkpoint_freq == 0): # make the checkpoint snapshot checkpoint_snapshot = SimSnapshot(walkers, SimApparatus(filters)) # save the checkpoint (however that is implemented) self._save_checkpoint(start_hash, checkpoint_snapshot, configuration, checkpoint_dir, mode=dump_mode) cycle_idx += 1 # run the segment given the sim manager and run parameters end_snapshot = SimSnapshot(walkers, SimApparatus(filters)) # run the cleanup subroutine sim_manager.cleanup() # add the snapshot and the run for it end_hash = self.add_snapshot(end_snapshot) self.register_run(start_hash, end_hash, configuration) # clear the object variable for the current checkpoints del self._curr_run_checkpoints return start_hash, end_hash def orchestrate_snapshot_run_by_time(self, snapshot_hash, run_time, n_steps, checkpoint_freq=None, checkpoint_dir=None, orchestrator_path=None, configuration=None, # these 
can reparametrize the paths # for both the orchestrator produced # files as well as the configuration work_dir=None, config_name=None, narration=None, mode=None, # extra kwargs will be passed to the # configuration.reparametrize method **kwargs): # for writing the orchestration files we set the default mode # if mode is not given if mode is None: # the orchestrator mode is used for pickling the # orchestrator and so must be in bytes mode orch_mode = self.DEFAULT_ORCHESTRATION_MODE elif 'b' not in mode: # add a bytes to the end of the mode for the orchestrator pickleization orch_mode = mode + 'b' else: orch_mode = mode # there are two possible uses for the path reparametrizations: # the configuration and the orchestrator file paths. If both # of those are explicitly specified by passing in the whole # configuration object or both of checkpoint_dir, # orchestrator_path then those reparametrization kwargs will # not be used. As this is likely not the intention of the user # we will raise an error. If there is even one use for them no # error will be raised. # first check if any reparametrizations were even requested parametrizations_requested = (True if work_dir is not None else False, True if config_name is not None else False, True if narration is not None else False, True if mode is not None else False) # check if there are any available targets for reparametrization reparametrization_targets = (True if configuration is None else False, True if checkpoint_dir is None else False, True if orchestrator_path is None else False) # if paramatrizations were requested and there are no targets # we need to raise an error if any(parametrizations_requested) and not any(reparametrization_targets): raise OrchestratorError("Reparametrizations were requested but none are possible," " due to all possible targets being already explicitly given") # if any paths were not given and no defaults for path # parameters we want to fill in the defaults for them. This # will also fill in any missing parametrizations with defaults # we do this by just setting the path parameters if they # aren't set, then later the parametrization targets will be # tested for if they have been set or not, and if they haven't # then these will be used to generate paths for them. if work_dir is None: work_dir = self.DEFAULT_WORKDIR if config_name is None: config_name = self.DEFAULT_CONFIG_NAME if narration is None: narration = self.DEFAULT_NARRATION if mode is None: mode = self.DEFAULT_MODE # if no configuration was specified use the default one if configuration is None: configuration = self.default_configuration # reparametrize the configuration with the given path # parameters and anything else in kwargs. 
If they are none # this will have no effect anyhow configuration = configuration.reparametrize(work_dir=work_dir, config_name=config_name, narration=narration, mode=mode, **kwargs) # make parametric paths for the checkpoint directory and the # orchestrator pickle to be made, unless they are explicitly given if checkpoint_dir is None: # the checkpoint directory will be in the work dir checkpoint_dir = work_dir if orchestrator_path is None: # the orchestrator pickle will be of similar form to the # reporters having the config name, and narration if # given, and an identifying compound file extension orch_narration = "_{}".format(narration) if len(narration) > 0 else "" orch_filename = self.ORCH_FILENAME_TEMPLATE.format(config=config_name, narration=orch_narration) orchestrator_path = osp.join(work_dir, orch_filename) run_tup = self.run_snapshot_by_time(snapshot_hash, run_time, n_steps, checkpoint_freq=checkpoint_freq, checkpoint_dir=checkpoint_dir, configuration=configuration, mode=mode) # then serialize thineself self.dump(orchestrator_path, mode=orch_mode) return run_tup def orchestrate_run_by_time(self, init_walkers, run_time, n_steps, **kwargs): # make the starting snapshot from the walkers and the # apparatus if given, otherwise the default will be used start_hash = self.gen_start_snapshot(init_walkers) # orchestrate from the snapshot return self.orchestrate_snapshot_run_by_time(start_hash, run_time, n_steps, **kwargs) def run_continues(self, start_hash, end_hash): """Return the run_id that this run continues.""" # loop through the runs in this orchestrator until we find one # where the start_hash matches the end hash runs = self.runs run_idx = 0 while True: run_start_hash, run_end_hash = runs[run_idx] # if the start hash of the queried run is the same as the # end hash for this run we have found it if start_hash == run_end_hash: return (run_start_hash, run_end_hash) run_idx += 1 # if the index is over the number of runs we quit and # return None as no match if run_idx >= len(runs): return None def serialize_orchestrator(orchestrator): return orchestrator.serialize() def deserialize_orchestrator(serial_str): return Orchestrator.deserialize(serial_str) def dump_orchestrator(orchestrator, filepath, mode='wb'): orchestrator.dump(filepath, mode=mode) def load_orchestrator(filepath, mode='rb'): return Orchestrator.load(filepath, mode=mode) def encode(obj): return Orchestrator.encode(obj) def decode(encoded_str): return Orchestrator.decode(encoded_str) def reconcile_orchestrators(template_orchestrator, *orchestrators): # make a new orchestrator new_orch = Orchestrator(template_orchestrator.default_apparatus, default_init_walkers=template_orchestrator.default_init_walkers, default_configuration=template_orchestrator.default_configuration) # put the template back into the list of orchestrators orchestrators = (template_orchestrator, *orchestrators) for orch in orchestrators: # add in all snapshots from each orchestrator, by the hash not the # snapshots themselves for snaphash in orch.snapshot_hashes: snapshot = orch.get_snapshot(snaphash) new_orch._add_snapshot(snaphash, snapshot) # register all the runs in each for run in list(orch.runs): run_config = orch.run_configuration(*run) new_orch.register_run(*run, run_config) return new_orch def recover_run_by_time(start_orch, checkpoint_orch, run_time, n_steps, **kwargs): # reconcile the checkpoint orchestrator with the master the # original orchestrator, we put the original orch first so that it # preserves the defaults new_orch = 
reconcile_orchestrators(start_orch, checkpoint_orch) # now we need to get the hash of the checkpoint at the end of # the checkpoint orch to start from that, a checkpoint orch # should only have one run and the checkpoint will be the end # of that run. checkpoint_hash = checkpoint_orch.runs[0][-1] # then all we need to do is orchestrate from this checkpoint run_tup = new_orch.orchestrate_snapshot_run_by_time(checkpoint_hash, run_time, n_steps, **kwargs) return new_orch, run_tup if __name__ == "__main__": pass PK!wepy/reporter/__init__.pyPK!߇wepy/reporter/dashboard.pyimport logging from wepy.reporter.reporter import ProgressiveFileReporter class DashboardReporter(ProgressiveFileReporter): SUGGESTED_EXTENSION = "dash.org" def dashboard_string(self): raise NotImplementedError def write_dashboard(self): with open(self.file_path, mode=self.mode) as dashboard_file: dashboard_file.write(self.dashboard_string()) PK!k>>wepy/reporter/hdf5.pyfrom copy import deepcopy import logging import numpy as np from wepy.reporter.reporter import FileReporter from wepy.hdf5 import WepyHDF5 from wepy.util.util import json_top_atom_count class WepyHDF5Reporter(FileReporter): ALL_ATOMS_REP_KEY = 'all_atoms' SUGGESTED_EXTENSION = "wepy.h5" def __init__(self, save_fields=None, topology=None, units=None, sparse_fields=None, feature_shapes=None, feature_dtypes=None, n_dims=None, main_rep_idxs=None, all_atoms_rep_freq=None, # dictionary of alt_rep keys and a tuple of (idxs, freq) alt_reps=None, # pass in the resampler and boundary # conditions classes to automatically extract the # needed data, the objects themselves are not saves resampler=None, boundary_conditions=None, # or pass the things we need from them in manually resampling_fields=None, decision_enum_dict=None, resampler_fields=None, warping_fields=None, progress_fields=None, bc_fields=None, resampling_records=None, resampler_records=None, warping_records=None, bc_records=None, progress_records=None, **kwargs ): # initialize inherited attributes super().__init__(**kwargs) # do all the WepyHDF5 specific stuff self.wepy_run_idx = None self._tmp_topology = topology # which fields from the walker to save, if None then save all of them self.save_fields = save_fields # dictionary of sparse_field_name -> int : frequency of cycles # to save the field self._sparse_fields = sparse_fields self._feature_shapes = feature_shapes self._feature_dtypes = feature_dtypes self._n_dims = n_dims # get and set the record fields (naems, shapes, dtypes) for # the resampler and the boundary conditions if (resampling_fields is not None) and (decision_enum_dict is not None): self.resampling_fields = resampling_fields self.decision_enum = decision_enum_dict elif resampler is not None: self.resampling_fields = resampler.resampling_fields() self.decision_enum = resampler.DECISION.enum_dict_by_name() else: self.resampling_fields = None self.decision_enum = None if resampler_fields is not None: self.resampler_fields = resampler_fields() elif resampler is not None: self.resampler_fields = resampler.resampler_fields() else: self.resampler_fields = None if warping_fields is not None: self.warping_fields = warping_fields() elif boundary_conditions is not None: self.warping_fields = boundary_conditions.warping_fields() else: self.warping_fields = None if progress_fields is not None: self.progress_fields = progress_fields() elif boundary_conditions is not None: self.progress_fields = boundary_conditions.progress_fields() else: self.progress_fields = None if bc_fields is not None: self.bc_fields = 
bc_fields() elif boundary_conditions is not None: self.bc_fields = boundary_conditions.bc_fields() else: self.bc_fields = None # the fields which are records for table like reports if resampling_records is not None: self.resampling_records = resampling_records elif resampler is not None: self.resampling_records = resampler.resampling_record_field_names() else: self.resampling_records = None if resampler_records is not None: self.resampler_records = resampler_records elif resampler is not None: self.resampler_records = resampler.resampler_record_field_names() else: self.resampler_records = None if bc_records is not None: self.bc_records = bc_records elif boundary_conditions is not None: self.bc_records = boundary_conditions.bc_record_field_names() else: self.bc_records = None if warping_records is not None: self.warping_records = warping_records elif boundary_conditions is not None: self.warping_records = boundary_conditions.warping_record_field_names() else: self.warping_records = None if progress_records is not None: self.progress_records = progress_records elif boundary_conditions is not None: self.progress_records = boundary_conditions.progress_record_field_names() else: self.progress_records = None # the atom indices of the whole system that will be saved as # the main positions representation self.main_rep_idxs = main_rep_idxs # the idxs for alternate representations of the system # positions if alt_reps is not None: self.alt_reps_idxs = {key: list(tup[0]) for key, tup in alt_reps.items()} # add the frequencies for these alt_reps to the # sparse_fields frequency dictionary self._sparse_fields.update({"alt_reps/{}".format(key): tup[1] for key, tup in alt_reps.items()}) else: self.alt_reps_idxs = {} # check for alt_reps of this name because this is reserved for # the all_atoms flag. 
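
# Illustrative sketch only, not part of the reporter source: how the `alt_reps`
# and `all_atoms_rep_freq` constructor arguments above are expected to translate
# into sparse-field frequencies. The name 'ligand', the atom indices, and the
# frequencies below are hypothetical.
alt_reps_example = {'ligand': ([0, 1, 2], 10)}   # (atom indices, save every 10 cycles)
all_atoms_freq_example = 100                     # save the full system every 100 cycles

sparse_fields_example = {"alt_reps/{}".format(name): freq
                         for name, (idxs, freq) in alt_reps_example.items()}
sparse_fields_example["alt_reps/all_atoms"] = all_atoms_freq_example

assert sparse_fields_example == {'alt_reps/ligand': 10, 'alt_reps/all_atoms': 100}
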
if self.ALL_ATOMS_REP_KEY in self.alt_reps_idxs: raise ValueError("Cannot name an alt_rep 'all_atoms'") # if there is a frequency for all atoms rep then we make an # alt_rep for the all_atoms system with the specified # frequency if all_atoms_rep_freq is not None: # count the number of atoms in the topology and set the # alt_reps to have the full slice for all atoms n_atoms = json_top_atom_count(self._tmp_topology) self.alt_reps_idxs[self.ALL_ATOMS_REP_KEY] = np.arange(n_atoms) # add the frequency for this sparse fields to the # sparse fields dictionary self._sparse_fields["alt_reps/{}".format(self.ALL_ATOMS_REP_KEY)] = all_atoms_rep_freq # if there are no sparse fields set it as an empty dictionary if self._sparse_fields is None: self._sparse_fields = {} # if units were given add them otherwise set as an empty dictionary if units is None: self.units = {} else: self.units = units def init(self, continue_run=None, **kwargs): # do the inherited stuff super().init(**kwargs) # open and initialize the HDF5 file self.wepy_h5 = WepyHDF5(self.file_path, mode=self.mode, topology=self._tmp_topology, units=self.units, sparse_fields=list(self._sparse_fields.keys()), feature_shapes=self._feature_shapes, feature_dtypes=self._feature_dtypes, n_dims=self._n_dims, main_rep_idxs=self.main_rep_idxs, alt_reps=self.alt_reps_idxs) with self.wepy_h5: # if this is a continuation run of another run we want to # initialize it as such # initialize a new run run_grp = self.wepy_h5.new_run(continue_run=continue_run) self.wepy_run_idx = run_grp.attrs['run_idx'] # initialize the run record groups using their fields self.wepy_h5.init_run_fields_resampling(self.wepy_run_idx, self.resampling_fields) # the enumeration for the values of resampling self.wepy_h5.init_run_fields_resampling_decision(self.wepy_run_idx, self.decision_enum) self.wepy_h5.init_run_fields_resampler(self.wepy_run_idx, self.resampler_fields) # set the fields that are records for tables etc. unless # they are already set if 'resampling' not in self.wepy_h5.record_fields: self.wepy_h5.init_record_fields('resampling', self.resampling_records) if 'resampler' not in self.wepy_h5.record_fields: self.wepy_h5.init_record_fields('resampler', self.resampler_records) # if there were no warping fields set there is no boundary # conditions and we don't initialize them if self.warping_fields is not None: self.wepy_h5.init_run_fields_warping(self.wepy_run_idx, self.warping_fields) self.wepy_h5.init_run_fields_progress(self.wepy_run_idx, self.progress_fields) self.wepy_h5.init_run_fields_bc(self.wepy_run_idx, self.bc_fields) # table records if 'warping' not in self.wepy_h5.record_fields: self.wepy_h5.init_record_fields('warping', self.warping_records) if 'boundary_conditions' not in self.wepy_h5.record_fields: self.wepy_h5.init_record_fields('boundary_conditions', self.bc_records) if 'progress' not in self.wepy_h5.record_fields: self.wepy_h5.init_record_fields('progress', self.progress_records) # if this was opened in a truncation mode, we don't want to # overwrite old runs with future calls to init(). 
so we # change the mode to read/write 'r+' if self.mode == 'w': self.set_mode(0, 'r+') def cleanup(self, **kwargs): # it should be already closed at this point but just in case if not self.wepy_h5.closed: self.wepy_h5.close() # remove reference to the WepyHDF5 file so we can serialize this object del self.wepy_h5 super().cleanup(**kwargs) def report(self, cycle_idx, walkers, warp_data, bc_data, progress_data, resampling_data, resampler_data, **kwargs): n_walkers = len(walkers) # determine which fields to save. If there were none specified # save all of them if self.save_fields is None: save_fields = list(walkers[0].state.dict().keys()) else: save_fields = self.save_fields with self.wepy_h5: # add trajectory data for the walkers for walker_idx, walker in enumerate(walkers): walker = deepcopy(walker) walker_data = walker.state.dict() # iterate through the feature vectors of the walker # (fields), and the keys for the alt_reps for field_path in list(walker_data.keys()): # save the field if it is in the list of save_fields if field_path not in save_fields: walker_data.pop(field_path) continue # if the result is None don't save anything if walker_data[field_path] is None: walker_data.pop(field_path) continue # if this is a sparse field we decide # whether it is a valid cycle to save on if field_path in self._sparse_fields: if cycle_idx % self._sparse_fields[field_path] != 0: # this is not a valid cycle so we # remove from the walker_data walker_data.pop(field_path) continue # Add the alt_reps fields by slicing the positions for alt_rep_key, alt_rep_idxs in self.alt_reps_idxs.items(): alt_rep_path = "alt_reps/{}".format(alt_rep_key) # check to make sure this is a cycle this is to be # saved to, if it is add it to the walker_data if cycle_idx % self._sparse_fields[alt_rep_path] == 0: # if the idxs are None we want all of the atoms if alt_rep_idxs is None: alt_rep_data = walker_data['positions'][:] # otherwise get only th atoms we want else: alt_rep_data = walker_data['positions'][alt_rep_idxs] walker_data[alt_rep_path] = alt_rep_data # lastly reduce the atoms for the main representation # if this option was given if self.main_rep_idxs is not None: walker_data['positions'] = walker_data['positions'][self.main_rep_idxs] # for all of these fields we wrap them in another # dimension to make them feature vectors for field_path in list(walker_data.keys()): walker_data[field_path] = np.array([walker_data[field_path]]) # save the data to the HDF5 file for this walker # check to see if the walker has a trajectory in the run if walker_idx in self.wepy_h5.run_traj_idxs(self.wepy_run_idx): # if it does then append to the trajectory self.wepy_h5.extend_traj(self.wepy_run_idx, walker_idx, weights=np.array([[walker.weight]]), data=walker_data) # start a new trajectory else: # add the traj for the walker with the data traj_grp = self.wepy_h5.add_traj(self.wepy_run_idx, weights=np.array([[walker.weight]]), data=walker_data) # add as metadata the cycle idx where this walker started traj_grp.attrs['starting_cycle_idx'] = cycle_idx # report the boundary conditions records data, if boundary # conditions were initialized if self.warping_fields is not None: self.report_warping(cycle_idx, warp_data) self.report_bc(cycle_idx, bc_data) self.report_progress(cycle_idx, progress_data) # report the resampling records data self.report_resampling(cycle_idx, resampling_data) self.report_resampler(cycle_idx, resampler_data) super().report(**kwargs) # sporadic def report_warping(self, cycle_idx, warping_data): if len(warping_data) 
> 0: self.wepy_h5.extend_cycle_warping_records(self.wepy_run_idx, cycle_idx, warping_data) def report_bc(self, cycle_idx, bc_data): if len(bc_data) > 0: self.wepy_h5.extend_cycle_bc_records(self.wepy_run_idx, cycle_idx, bc_data) def report_resampler(self, cycle_idx, resampler_data): if len(resampler_data) > 0: self.wepy_h5.extend_cycle_resampler_records(self.wepy_run_idx, cycle_idx, resampler_data) # the resampling records are provided every cycle but they need to # be saved as sporadic because of the variable number of walkers def report_resampling(self, cycle_idx, resampling_data): self.wepy_h5.extend_cycle_resampling_records(self.wepy_run_idx, cycle_idx, resampling_data) # continual def report_progress(self, cycle_idx, progress_data): self.wepy_h5.extend_cycle_progress_records(self.wepy_run_idx, cycle_idx, [progress_data]) PK!mqwepy/reporter/reporter.pyimport os import os.path as osp import pickle import logging class ReporterError(Exception): pass class Reporter(object): def __init__(self, **kwargs): pass def init(self, **kwargs): method_name = 'init' assert not hasattr(super(), method_name), \ "Superclass with method {} is masked".format(method_name) def report(self, **kwargs): method_name = 'report' assert not hasattr(super(), method_name), \ "Superclass with method {} is masked".format(method_name) def cleanup(self, **kwargs): method_name = 'cleanup' assert not hasattr(super(), method_name), \ "Superclass with method {} is masked".format(method_name) class FileReporter(Reporter): MODES = ('x', 'w', 'w-', 'r', 'r+',) DEFAULT_MODE = 'x' # these are keywords that can be recognized by subclasses of # FileReporter in kwargs in order to bypass path methods, in order # to always support a direct file_path setting method. For example # in the ParametrizableFileReporter if you don't want to set the # parametrizable things then you just pass in one of the bypass # keywords and it will skip its generation of the file_paths # through components BYPASS_KEYWORDS = ('file_path', 'file_paths',) SUGGESTED_FILENAME_TEMPLATE = "{config}{narration}.{ext}" SUGGESTED_EXTENSION = 'report' def __init__(self, file_paths=None, modes=None, file_path=None, mode=None, **kwargs): # file paths assert not ((file_paths is not None) and (file_path is not None)), \ "only file_paths or file_path kwargs can be specified" # if only one file path is given then we handle it as multiple if file_path is not None: file_paths = [file_path] self._file_paths = file_paths # modes assert not ((modes is not None) and (mode is not None)), \ "only modes or mode kwargs can be specified" # if modes is None we make modes, from defaults if we have to if modes is None: # if mode is None set it to the default if modes is None and mode is None: mode = self.DEFAULT_MODE # if only one mode is given copy it for each file given modes = [mode for i in range(len(self._file_paths))] self._modes = modes super().__init__(**kwargs) def _bypass_dispatch(self, **kwargs): # check if we are bypassing the parametrization for # compatibility if any([True if key in self.BYPASS_KEYWORDS else False for key in kwargs.keys()]): # we just call the superclass methods then FileReporter.__init__(self, **kwargs) # unfortunately without doing metaclass weird stuff the # returned object will be an unparametrizable # ParamatrizableFileReporter but I think its okay for the # use cases it will be used for return True else: return False def _validate_mode(self, mode): if mode in self.MODES: return True else: return False @property def mode(self): if len(self._file_paths) 
> 1: raise ReporterError("there are multiple files and modes defined") return self._modes[0] @property def file_path(self): if len(self._file_paths) > 1: raise ReporterError("there are multiple files and modes defined") return self._file_paths[0] @property def file_paths(self): return self._file_paths @file_paths.setter def file_paths(self, file_paths): for i, file_path in enumerate(file_paths): self.set_path(i, file_path) def set_path(self, file_idx, path): self._paths[file_idx] = path @property def modes(self): return self._modes @modes.setter def modes(self, modes): for i, mode in enumerate(modes): self.set_mode(i, mode) def set_mode(self, file_idx, mode): if self._validate_mode(mode): self._modes[file_idx] = mode else: raise ValueError("Incorrect mode {}".format(mode)) def reparametrize(self, file_paths, modes): self.file_paths = file_paths self.modes = modes class ParametrizableFileReporter(FileReporter): PATH_TEMPLATE = "{work_dir}/{root_name}{suffix}.{extensions}" SUFFIX_TEMPLATE = "_{}" def __init__(self, root_names=None, work_dir=None, suffix=None, extensions=None, **kwargs): # we check if we should bypass this class initialization and # just do the base stuff if self._bypass_dispatch(**kwargs): return None # otherwise we do the parametrization self._root_names = root_names if work_dir is None: self._work_dir = osp.realpath(osp.curdir) else: self._work_dir = work_dir if suffix is None: self._suffix = "" else: self._suffix = self.SUFFIX_TEMPLATE.format(suffix) self._extensions = extensions file_paths = [] for file_idx, root_name in enumerate(self._root_names): # construct a path for this reporter file_path = self.PATH_TEMPLATE.format(root_name=root_name, work_dir=osp.realpath(self._work_dir), suffix=self._suffix, extensions=self._extensions[file_idx].strip('.') ) file_paths.append(file_path) super().__init__(file_paths=file_paths, **kwargs) @property def root_names(self): return self._root_names @property def work_dir(self): return self._work_dir @property def suffix(self): return self._suffix @property def extensions(self): return self._extensions class ProgressiveFileReporter(FileReporter): """Super class for a reporter that will successively overwrite the same file over and over again. The base FileReporter really only supports creation of file one time. """ def init(self, *args, **kwargs): super().init(**kwargs) # because we want to overwrite the file at every cycle we # need to change the modes to write with truncate. This allows # the file to first be opened in 'x' or 'w-' and check whether # the file already exists (say from another run), and warn the # user. However, once the file has been created for this run # we need to overwrite it many times forcefully. 
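
# Illustrative usage sketch, not part of the source: how the mode handling described
# above is meant to behave for a ProgressiveFileReporter subclass. The subclass name,
# file path, and dashboard text here are hypothetical.
from wepy.reporter.dashboard import DashboardReporter

class MinimalDashboard(DashboardReporter):
    def dashboard_string(self):
        return "cycle report\n"

reporter = MinimalDashboard(file_path="sim.dash.org", mode='x')
reporter.init()             # raises FileExistsError if the file already exists, then flips the mode to 'w'
reporter.write_dashboard()  # every later call overwrites the same file in place
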
        # go through each file managed by this reporter
        for file_i, mode in enumerate(self.modes):

            # if the mode is 'x' or 'w-' we check to make sure the file
            # doesn't exist
            if mode in ('x', 'w-'):
                file_path = self.file_paths[file_i]
                if osp.exists(file_path):
                    raise FileExistsError("File exists: '{}'".format(file_path))

            # now that we have checked that the file does not already
            # exist, switch it into overwrite mode
            self.set_mode(file_i, 'w')

wepy/reporter/restart.py

import os.path as osp
from copy import deepcopy
import pickle
import logging

from wepy.reporter.reporter import FileReporter
from wepy.sim_manager import Manager

class RestartReporter(FileReporter):

    def __init__(self, file_path, mode='x'):
        super().__init__(file_path=file_path, mode=mode)

    def init(self, *args,
             runner=None,
             resampler=None,
             boundary_conditions=None,
             work_mapper=None,
             reporters=None,
             **kwargs):

        # get a reference to each of the parts
        self.runner = runner
        self.resampler = resampler
        self.boundary_conditions = boundary_conditions
        self.reporters = reporters

    def cleanup(self, *args, work_mapper=None, **kwargs):

        # before calling this, be sure that the other reporters have
        # been cleaned up, or at least that they don't contain anything
        # that will cause errors on deepcopying or pickling
        self.work_mapper = work_mapper

        # copy this reporter
        self_copy = deepcopy(self)

        # save this object as a pickle, opened in binary mode
        with open(self.file_path, mode=self.mode + "b") as wf:
            pickle.dump(self_copy, wf)

    def report(self, cycle_idx, new_walkers, *args, resampled_walkers=None, **kwargs):

        self.cycle_idx = cycle_idx

        # walkers after the sampling segment
        self.new_walkers = new_walkers

        # resampled walkers, which would be used if the run were
        # continued (what we are preparing for here)
        self.restart_walkers = resampled_walkers

    def new_sim_manager(self, file_report_suffix=None, reporter_base_path=None):
        """Generate a simulation manager from the objects in this restarter.

        All objects are deepcopied so that this restarter object can be
        used multiple times without reading from disk again.

        If a `file_report_suffix` string is given, all reporters
        inheriting from FileReporter will have their `file_path`
        attribute modified. `file_report_suffix` is appended to the
        first clause (clauses here are the portions of the file name
        separated by '.'s), so that for 'file.txt' the substring 'file'
        is replaced by 'file{}', where the suffix is formatted into that
        string.

        """

        # check the arguments for correctness
        if (file_report_suffix is not None) or \
           (reporter_base_path is not None):

            # if just the base path is given
            if file_report_suffix is None:
                assert type(reporter_base_path) is str, \
                    "'reporter_base_path' must be a string, given {}".format(type(reporter_base_path))

            if reporter_base_path is None:
                assert type(file_report_suffix) is str, \
                    "'file_report_suffix' must be a string, given {}".format(type(file_report_suffix))

        # copy the reporters from this object's list of reporters; we
        # also need to replace the restart reporter in that list with
        # this one.
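
# Illustrative sketch only: the filename rewriting that the docstring above describes.
# The suffix is formatted into the first '.'-separated clause of a FileReporter path;
# the filename and suffix here are hypothetical.
filename = "results.wepy.h5"
clauses = filename.split('.')
template_clause = clauses[0] + "{}"
new_filename = '.'.join([template_clause.format("_restart1"), *clauses[1:]])
assert new_filename == "results_restart1.wepy.h5"
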
reporters = [] for reporter in self.reporters: # if the reporter is a restart reporter we replace it with # a copy of this reporter, it will be mutated later # potentially to change the path to save the pickle if isinstance(reporter, RestartReporter): reporters.append(deepcopy(self)) else: # otherwise make a copy of the reporter reporters.append(deepcopy(reporter)) # copy all of the other objects before construction restart_walkers = deepcopy(self.restart_walkers) runner = deepcopy(self.runner) resampler = deepcopy(self.resampler) boundary_conditions = deepcopy(self.boundary_conditions) work_mapper = deepcopy(self.work_mapper) # modify them if this was specified # update the FileReporter paths # iterate through the reporters and add the suffix to the # FileReporter subclasses for reporter in reporters: # check if the reporter class is a FileReporter subclass if issubclass(type(reporter), FileReporter): # because we are making a new file with any change, we # need to modify the access mode to a conservative # creation mode reporter.mode = 'x' if file_report_suffix is not None: filename = osp.basename(reporter.file_path) # get the clauses clauses = filename.split('.') # make a template out of the first clause template_clause = clauses[0] + "{}" # fill in the suffix to the template clause mod_clause = template_clause.format(file_report_suffix) # combine them back into a filename new_filename = '.'.join([mod_clause, *clauses[1:]]) else: # if we don't have a suffix just return the original name filename = osp.basename(reporter.file_path) new_filename = filename # if a new base path was given make that the path # to the filename if reporter_base_path is not None: new_path = osp.join(reporter_base_path, new_filename) # if it wasn't given, add the rest of the original path back else: new_path = osp.join(osp.dirname(reporter.file_path), new_filename) # make a copy of the reporter and pass this to the # sim manager instead of the one in the object new_reporter = reporter new_reporter.file_path = new_path # construct the sim manager sim_manager = Manager(restart_walkers, runner=runner, resampler=resampler, boundary_conditions=boundary_conditions, work_mapper=work_mapper, reporters=reporters) return sim_manager PK!MԲ  wepy/reporter/setup.pyfrom copy import deepcopy import pickle import logging from wepy.reporter.reporter import FileReporter class SetupReporter(FileReporter): def __init__(self, file_path, mode='x'): super().__init__(file_path, mode=mode) def init(self, *args, resampler=None, boundary_conditions=None, init_walkers=None, **kwargs): self.init_walkers = init_walkers self.resampler = resampler self.boundary_conditions = boundary_conditions # copy this reporter with as well as all of the objects it contains self_copy = deepcopy(self) # save this object as a pickle, open it in the mode with open(self.file_path, mode=self.mode+"b") as wf: pickle.dump(self_copy, wf) PK!"wepy/reporter/wexplore/__init__.pyPK!ﱃJJ#wepy/reporter/wexplore/dashboard.pyimport os.path as osp from collections import defaultdict import itertools as it import logging from wepy.reporter.dashboard import DashboardReporter import numpy as np import pandas as pd class WExploreDashboardReporter(DashboardReporter): SUGGESTED_EXTENSION = "wexplore.dash.org" DASHBOARD_TEMPLATE = \ """* Weighted Ensemble Simulation Integration Step Size: {step_time} seconds {step_time_femtoseconds} femtoseconds Last Cycle Index: {last_cycle_idx} Number of Cycles: {n_cycles} Single Walker Sampling Time: {walker_total_sampling_time} seconds 
{walker_total_sampling_time_microseconds} microseconds Total Sampling Time: {total_sampling_time} seconds {total_sampling_time_microseconds} microseconds * WExplore Max Number of Regions: {max_n_regions} Max Region Sizes: {max_region_sizes} Number of Regions per level: {regions_per_level} ** Region Hierarchy Defined Regions with the number of child regions per parent region: {region_hierarchy} ** WExplore Log {wexplore_log} * Walker Table {walker_table} * Leaf Region Table {leaf_region_table} * Warping through boundary conditions Cutoff Distance: {cutoff_distance} Number of Exit Points this Cycle: {cycle_n_exit_points} Total Number of Exit Points: {n_exit_points} Cumulative Unbound Weight {total_unbound_weight} Expected Reactive Traj. Time: {expected_unbinding_time} seconds Expected Reactive Traj. Rate: {reactive_traj_rate} 1/seconds Rate: {exit_rate} 1/seconds ** Warping Log {warping_log} * Performance Average Runner Time: {avg_runner_time} Average Boundary Conditions Time: {avg_bc_time} Average Resampling Time: {avg_resampling_time} Average Cycle Time: {avg_cycle_time} Worker Avg. Segment Times: {worker_avg_segment_time} ** Cycle Performance Log {cycle_log} ** Worker Performance Log {performance_log} """ def __init__(self, step_time=None, # seconds max_n_regions=None, max_region_sizes=None, bc_cutoff_distance=None, **kwargs ): super().__init__(**kwargs) assert step_time is not None, "length of integration time step must be given" self.step_time = step_time assert max_n_regions is not None, "number of regions per level for WExplore must be given" self.max_n_regions = max_n_regions assert max_region_sizes is not None, "region sizes for WExplore must be given" self.max_region_sizes = max_region_sizes self.n_levels = len(self.max_n_regions) assert bc_cutoff_distance is not None, "cutoff distance for the boundary conditions must be given" self.bc_cutoff_distance = bc_cutoff_distance ## recalculated values # weighted ensemble self.walker_weights = [] self.last_cycle_idx = 0 self.n_cycles = 0 self.walker_total_sampling_time = 0.0 # seconds self.total_sampling_time = 0.0 # seconds # warps self.n_exit_points = 0 self.cycle_n_exit_points = 0 self.total_unbound_weight = 0.0 self.exit_rate = np.inf # 1 / seconds self.expected_unbinding_time = np.inf # seconds self.reactive_traj_rate = 0.0 # 1 / seconds # progress self.walker_distance_to_prot = [] # nanometers # WExplore # resampler self.root_region = () init_leaf_region = tuple([0 for i in range(self.n_levels)]) self.region_ids = [init_leaf_region] self.regions_per_level = [] self.children_per_region = {} # resampling self.walker_assignments = [] self.walker_image_distances = [] self.curr_region_probabilities = defaultdict(int) self.curr_region_counts = defaultdict(int) # performance self.avg_cycle_time = np.nan self.avg_runner_time = np.nan self.avg_bc_time = np.nan self.avg_resampling_time = np.nan self.worker_agg_table = None ## Log of events variables # boundary conditions self.exit_point_weights = [] self.exit_point_times = [] self.warp_records = [] # wexplore self.branch_records = [] # performance self.cycle_compute_times = [] self.cycle_runner_times = [] self.cycle_bc_times = [] self.cycle_resampling_times = [] self.worker_records = [] def report(self, cycle_idx, walkers, warp_data, bc_data, progress_data, resampling_data, resampler_data, n_steps=None, worker_segment_times=None, cycle_runner_time=None, cycle_bc_time=None, cycle_resampling_time=None, *args, **kwargs): # first recalculate the total sampling time, update the # number of cycles, 
and set the walker probabilities self.update_weighted_ensemble_values(cycle_idx, n_steps, walkers) # if there were any warps we need to set new values for the # warp variables and add records self.update_warp_values(cycle_idx, warp_data) # update progress towards the boundary conditions self.update_progress_values(cycle_idx, progress_data) # now we update the WExplore values self.update_wexplore_values(cycle_idx, resampling_data, resampler_data) # update the performance of the workers for our simulation self.update_performance_values(cycle_idx, n_steps, worker_segment_times, cycle_runner_time, cycle_bc_time, cycle_resampling_time) # write the dashboard self.write_dashboard() def update_weighted_ensemble_values(self, cycle_idx, n_steps, walkers): # the number of cycles self.last_cycle_idx = cycle_idx self.n_cycles += 1 # amount of new sampling time for each walker new_walker_sampling_time = self.step_time * n_steps # accumulated sampling time for a single walker self.walker_total_sampling_time += new_walker_sampling_time # amount of sampling time for all walkers new_sampling_time = new_walker_sampling_time * len(walkers) # accumulated sampling time for the ensemble self.total_sampling_time += new_sampling_time # the weights of the walkers self.walker_weights = [walker.weight for walker in walkers] def update_warp_values(self, cycle_idx, warp_data): self.cycle_n_exit_points = 0 for warp_record in warp_data: weight = warp_record['weight'][0] walker_idx = warp_record['walker_idx'][0] record = (walker_idx, weight, cycle_idx, self.walker_total_sampling_time) self.warp_records.append(record) # also add them to the individual records self.exit_point_weights.append(weight) self.exit_point_times.append(self.walker_total_sampling_time) # increase the number of exit points by 1 self.n_exit_points += 1 self.cycle_n_exit_points += 1 # total accumulated unbound probability self.total_unbound_weight += weight # calculate the new rate using the Hill relation after taking # into account all of these warps self.exit_rate = self.total_unbound_weight / self.total_sampling_time # calculate the expected value of unbinding times self.expected_unbinding_time = np.sum([self.exit_point_weights[i] * self.exit_point_times[i] for i in range(self.n_exit_points)]) # expected rate of reactive trajectories self.reactive_traj_rate = 1 / self.expected_unbinding_time def update_progress_values(self, cycle_idx, progress_data): self.walker_distance_to_prot = tuple(progress_data['min_distances']) def update_wexplore_values(self, cycle_idx, resampling_data, resampler_data): # the region assignments for walkers assignments = [] # re-initialize the current weights dictionary self.curr_region_probabilities = defaultdict(int) self.curr_region_counts = defaultdict(int) for walker_record in resampling_data: assignment = tuple(walker_record['region_assignment']) walker_idx = walker_record['walker_idx'][0] assignments.append((walker_idx, assignment)) # calculate the probabilities and counts of the regions # given the current distribution of walkers self.curr_region_probabilities[assignment] += self.walker_weights[walker_idx] self.curr_region_counts[assignment] += 1 # sort them to get the walker indices in the right order assignments.sort() # then just get the assignment since it is sorted self.walker_assignments = [assignment for walker, assignment in assignments] # add to the records for region creation in WExplore for resampler_record in resampler_data: # get the values new_leaf_id = tuple(resampler_record['new_leaf_id']) 
branching_level = resampler_record['branching_level'][0] walker_image_distance = resampler_record['distance'][0] # add the new leaf id to the list of regions in the order they were created self.region_ids.append(new_leaf_id) # make a new record for a branching event which is: # (region_id, level branching occurred, distance of walker that triggered the branching) branch_record = (new_leaf_id, branching_level, walker_image_distance) # save it in the records self.branch_records.append(branch_record) # count the number of child regions each region has self.children_per_region = {} all_regions = self.leaf_regions_to_all_regions(self.region_ids) for region_id in all_regions: # if its a leaf region it has no children if len(region_id) == self.n_levels: self.children_per_region[region_id] = 0 # all others we cound how many children it has else: # get all regions that have this one as a root children_idxs = set() for poss_child_id in all_regions: # get the root at the level of this region for the child poss_child_root = poss_child_id[0:len(region_id)] # if the root is the same we keep it without # counting children below the next level, but we skip the same region if (poss_child_root == region_id) and (poss_child_id != region_id): try: child_idx = poss_child_id[len(region_id)] except IndexError: import ipdb; ipdb.set_trace() children_idxs.add(child_idx) # count the children of this region self.children_per_region[region_id] = len(children_idxs) # count the number of regions at each level self.regions_per_level = [0 for i in range(self.n_levels)] for region_id, n_children in self.children_per_region.items(): level = len(region_id) # skip the leaves if level == self.n_levels: continue self.regions_per_level[level] += n_children def update_performance_values(self, cycle_idx, n_steps, worker_segment_times, cycle_runner_time, cycle_bc_time, cycle_resampling_time): ## worker specific performance # only do this part if there were any workers if len(worker_segment_times) > 0: # log of segment times for workers for worker_idx, segment_times in worker_segment_times.items(): for segment_time in segment_times: record = (cycle_idx, n_steps, worker_idx, segment_time) self.worker_records.append(record) # make a table out of these and compute the averages for each # worker worker_df = pd.DataFrame(self.worker_records, columns=('cycle_idx', 'n_steps', 'worker_idx', 'segment_time')) # the aggregated table for the workers self.worker_agg_table = worker_df.groupby('worker_idx')[['segment_time']].aggregate(np.mean) self.worker_agg_table.rename(columns={'segment_time' : 'avg_segment_time (s)'}, inplace=True) else: self.worker_records = [] self.worker_agg_table = pd.DataFrame({'avg_segment_time (s)' : []}) ## cycle times # log of the components times self.cycle_runner_times.append(cycle_runner_time) self.cycle_bc_times.append(cycle_bc_time) self.cycle_resampling_times.append(cycle_resampling_time) # add up the three components to get the overall cycle time cycle_time = cycle_runner_time + cycle_bc_time + cycle_resampling_time # log of cycle times self.cycle_compute_times.append(cycle_time) # average of cycle components times self.avg_runner_time = np.mean(self.cycle_runner_times) self.avg_bc_time = np.mean(self.cycle_bc_times) self.avg_resampling_time = np.mean(self.cycle_resampling_times) # average cycle time self.avg_cycle_time = np.mean(self.cycle_compute_times) def leaf_regions_to_all_regions(self, region_ids): # make a set of all the regions starting with the root region regions = set([self.root_region]) for 
region_id in region_ids: for i in range(len(region_id)): regions.add(region_id[0:i+1]) regions = list(regions) regions.sort() return regions def dashboard_string(self): regions = self.leaf_regions_to_all_regions(self.region_ids) region_children = [self.children_per_region[region] for region in regions] region_children_pairs = it.chain(*zip(regions, region_children)) region_hierarchy = '\n'.join(['{} {}' for i in range(len(regions))]).format(*region_children_pairs) # make the table of walkers using pandas, using the order here # TODO add the image distances walker_table_colnames = ('weight', 'assignment', 'progress') #'image_distances' walker_table_d = {} walker_table_d['weight'] = self.walker_weights walker_table_d['assignment'] = self.walker_assignments walker_table_d['progress'] = self.walker_distance_to_prot walker_table_df = pd.DataFrame(walker_table_d, columns=walker_table_colnames) walker_table_str = walker_table_df.to_string() # make a table for the regions region_table_colnames = ('region', 'n_walkers', 'curr_weight') region_table_d = {} region_table_d['region'] = self.region_ids region_table_d['n_walkers'] = [self.curr_region_counts[region] for region in self.region_ids] region_table_d['curr_weight'] = [self.curr_region_probabilities[region] for region in self.region_ids] leaf_region_table_df = pd.DataFrame(region_table_d, columns=region_table_colnames) leaf_region_table_df.set_index('region', drop=True) leaf_region_table_str = leaf_region_table_df.to_string() # table for aggregeated worker stats worker_agg_table_str = self.worker_agg_table.to_string() # log of branching events branching_table_colnames = ('new_leaf_id', 'branching_level', 'trigger_distance') branching_table_df = pd.DataFrame(self.branch_records, columns=branching_table_colnames) branching_table_str = branching_table_df.to_string() # log of warp events warp_table_colnames = ('walker_idx', 'weight', 'cycle_idx', 'time (s)') warp_table_df = pd.DataFrame(self.warp_records, columns=warp_table_colnames) warp_table_str = warp_table_df.to_string() # log of cycle times cycle_table_colnames = ('cycle_time (s)', 'runner_time (s)', 'boundary_conditions_time (s)', 'resampling_time (s)') cycle_table_df = pd.DataFrame({'cycle_times' : self.cycle_compute_times, 'runner_time' : self.cycle_runner_times, 'boundary_conditions_time' : self.cycle_bc_times, 'resampling_time' : self.cycle_resampling_times}, columns=cycle_table_colnames) cycle_table_str = cycle_table_df.to_string() # log of workers performance worker_table_colnames = ('cycle_idx', 'n_steps', 'worker_idx', 'segment_time (s)',) worker_table_df = pd.DataFrame(self.worker_records, columns=worker_table_colnames) worker_table_str = worker_table_df.to_string() # format the dashboard string dashboard = self.DASHBOARD_TEMPLATE.format( step_time=self.step_time, step_time_femtoseconds=self.step_time * 10e15, last_cycle_idx=self.last_cycle_idx, n_cycles=self.n_cycles, walker_total_sampling_time=self.walker_total_sampling_time, walker_total_sampling_time_microseconds=self.walker_total_sampling_time * 10e6, total_sampling_time=self.total_sampling_time, total_sampling_time_microseconds=self.total_sampling_time * 10e6, cutoff_distance=self.bc_cutoff_distance, n_exit_points=self.n_exit_points, cycle_n_exit_points=self.cycle_n_exit_points, total_unbound_weight=self.total_unbound_weight, expected_unbinding_time=self.expected_unbinding_time, reactive_traj_rate=self.reactive_traj_rate, exit_rate=self.exit_rate, walker_distance_to_prot=self.walker_distance_to_prot, 
max_n_regions=self.max_n_regions, max_region_sizes=self.max_region_sizes, regions_per_level=self.regions_per_level, region_hierarchy=region_hierarchy, avg_runner_time=self.avg_runner_time, avg_bc_time=self.avg_bc_time, avg_resampling_time=self.avg_resampling_time, avg_cycle_time=self.avg_cycle_time, worker_avg_segment_time=worker_agg_table_str, walker_table=walker_table_str, leaf_region_table=leaf_region_table_str, warping_log=warp_table_str, wexplore_log=branching_table_str, cycle_log=cycle_table_str, performance_log=worker_table_str, ) return dashboard PK!wepy/resampling/__init__.pyPK!%wepy/resampling/decisions/__init__.pyPK!  (wepy/resampling/decisions/clone_merge.pyfrom collections import namedtuple, defaultdict from enum import Enum import logging import numpy as np from wepy.resampling.decisions.decision import Decision from wepy.walker import split, keep_merge # the possible types of decisions that can be made enumerated for # storage, these each correspond to specific instruction type class CloneMergeDecisionEnum(Enum): NOTHING = 1 CLONE = 2 SQUASH = 3 KEEP_MERGE = 4 class MultiCloneMergeDecision(Decision): ENUM = CloneMergeDecisionEnum FIELDS = ('decision_id', 'target_idxs',) SHAPES = ((1,), Ellipsis,) DTYPES = (np.int, np.int,) RECORD_FIELDS = ('decision_id', 'target_idxs') # the decision types that pass on their state ANCESTOR_DECISION_IDS = (ENUM.NOTHING.value, ENUM.KEEP_MERGE.value, ENUM.CLONE.value,) @classmethod def field_names(cls): return cls.FIELDS @classmethod def field_shapes(cls): return cls.SHAPES @classmethod def field_dtypes(cls): return cls.DTYPES @classmethod def fields(cls): return list(zip(cls.field_names(), cls.field_shapes(), cls.field_dtypes())) @classmethod def record_field_names(cls): return self.RECORD_FIELDS @classmethod def record(cls, enum_value, target_idxs): record = super().record(enum_value) record['target_idxs'] = target_idxs return record @classmethod def action(cls, walkers, decisions): """Performs cloning and merging according to a list of resampling records for some walkers.""" # list for the modified walkers mod_walkers = [None for i in range(len(walkers))] # perform clones and merges for each step of resampling for step_idx, step_recs in enumerate(decisions): # we need to collect groups of merges, one entry for each # merge, where the key is the walker_idx of the keep merge slot squash_walkers = defaultdict(list) keep_walkers = {} # go through each decision and perform the decision # instructions for walker_idx, walker_rec in enumerate(step_recs): decision_value = walker_rec['decision_id'] instruction = walker_rec['target_idxs'] if decision_value == cls.ENUM.NOTHING.value: # check to make sure a walker doesn't already exist # where you are going to put it if mod_walkers[instruction[0]] is not None: raise ValueError( "Multiple walkers assigned to position {}".format(instruction[0])) # put the walker in the position specified by the # instruction mod_walkers[instruction[0]] = walkers[walker_idx] # for a clone elif decision_value == cls.ENUM.CLONE.value: # get the walker to be cloned walker = walkers[walker_idx] # "clone" it by splitting it into walkers of the # same state with even weights clones = split(walker, number=len(instruction)) # then assign each of these clones to a target # walker index in the next step for clone_idx, target_idx in enumerate(instruction): # check that there are not another walker # already assigned to this position if mod_walkers[target_idx] is not None: raise ValueError( "Multiple walkers assigned to position 
{}".format(instruction[0])) # TODO this comment was just fixed so I # believe that there was some serious problems # before # mod_walkers[walker_idx] = clones[clone_idx] # assign the clone to the modified walkers of the next step mod_walkers[target_idx] = clones[clone_idx] # if it is a decision for merging we must perform this # once we know all the merge targets for each merge group elif decision_value == cls.ENUM.SQUASH.value: # save this walker to the appropriate merge group to # merge after going through the list of walkers squash_walkers[instruction[0]].append(walker_idx) elif decision_value == cls.ENUM.KEEP_MERGE.value: keep_walkers[instruction[0]] = walker_idx else: raise ValueError("Decision not recognized") # do the merging for each merge group for target_idx, walker_idxs in squash_walkers.items(): keep_idx = keep_walkers[target_idx] # collect the walkers in the merge group, the keep idx is # always the first in the list merge_grp = [walkers[keep_idx]] + [walkers[i] for i in walker_idxs] # merge the walkers merged_walker = keep_merge(merge_grp, 0) # make sure there is not already a walker in this slot if mod_walkers[target_idx] is not None: raise ValueError( "Multiple walkers assigned to position {}".format(target_idx)) # set it in the slot for the keep_idx mod_walkers[keep_idx] = merged_walker if not all([False if walker is None else True for walker in mod_walkers]): raise ValueError("Some walkers were not created") return mod_walkers PK!%Ip%wepy/resampling/decisions/decision.pyfrom collections import namedtuple from enum import Enum from string import ascii_lowercase import logging import numpy as np # ABC for the Decision class class Decision(object): ENUM = None FIELDS = ('decision_id') # suggestion for subclassing # FIELDS = super().FIELDS + ('target_idxs',) # etc. # An Ellipsis instead of fields indicate there is a variable # number of fields. 
SHAPES = ((1,),) DTYPES = (np.int,) @classmethod def enum_dict_by_name(cls): if cls.ENUM is None: raise NotImplementedError d = {} for enum in cls.ENUM: d[enum.name] = enum.value return d @classmethod def enum_dict_by_value(cls): if cls.ENUM is None: raise NotImplementedError d = {} for enum in cls.ENUM: d[enum.value] = enum return d @classmethod def enum_by_value(cls, enum_value): d = cls.enum_dict_by_value() return d[enum_value] @classmethod def enum_by_name(cls, enum_name): d = cls.enum_dict_by_name() return d[enum_name] @classmethod def record(cls, enum_value): # TODO check to make sure the enum value is valid return {'decision_id' : enum_value} @classmethod def action(cls, walkers, decisions): """Perform the instructions for a set of resampling records on walkers.""" raise NotImplementedError @classmethod def parents(cls, step): """Given a row of resampling records (for a single resampling step) returns the parents of the children of this step.""" # initialize a list for the parents of this stages walkers step_parents = [None for i in range(len(step))] # the rest of the stages parents are based on the previous stage for parent_idx, parent_rec in enumerate(step): # if the decision is an ancestor then the instruction # values will be the children if parent_rec[0] in cls.ANCESTOR_DECISION_IDS: # the first value of the parent record is the target # idxs child_idxs = parent_rec[1] for child_idx in child_idxs: step_parents[child_idx] = parent_idx return step_parents class NothingDecisionEnum(Enum): NOTHING = 0 # an example of a Decision class that has the enumeration, instruction # record namedtuple, and the instruction dtypes class NoDecision(Decision): ENUM = NothingDecisionEnum INSTRUCTION_NAMES = ( (ENUM.NOTHING, "NothingInstructionRecord"), ) INSTRUCTION_FIELDS = ( (ENUM.NOTHING, ('pos',)),) INSTRUCTION_FIELD_DTYPES = ( (ENUM.NOTHING, (np.int,)), ) # the decision types that pass on their state ANCESTOR_DECISION_IDS = (ENUM.NOTHING.value,) @classmethod def action(cls, walkers, decisions): # list for the modified walkers mod_walkers = [None for i in range(len(walkers))] # go through each decision and perform the decision # instructions for walker_idx, decision in enumerate(decisions): decision_value, instruction = decision if decision_value == cls.ENUM.NOTHING.value: # check to make sure a walker doesn't already exist # where you are going to put it if mod_walkers[instruction[0]] is not None: raise ValueError( "Multiple walkers assigned to position {}".format(instruction[0])) # put the walker in the position specified by the # instruction mod_walkers[instruction[0]] = walkers[walker_idx] return mod_walkers PK!%wepy/resampling/distances/__init__.pyPK!+f))%wepy/resampling/distances/distance.pyimport logging class Distance(object): def __init__(self): pass def image(self, state): """Return the reduced representation of a state that is the only necessary portion needed for calculating the distance between two states. This is useful for storing "images" of the states that are much smaller than the potentially very large states. This is the abstract implementation which just returns the whole state, as a default for all subclasses. Overriding this will customize this functionality without having to also override the distance method. """ return state def image_distance(self, image_a, image_b): """The image_distance is the distance function computed between the exact images necessary for the resultant distance value. 
The `distance` function is just a wrapper around this function which first gets the images from valid states. This needs to be implemented in subclasses of Distance. """ raise NotImplementedError def distance(self, state_a, state_b): """ Compute the distance between two states. """ return self.image_distance(self.image(state_a), self.image(state_b)) PK!tI>I>#wepy/resampling/distances/openmm.pyfrom copy import copy from copy import deepcopy import logging import numpy as np import numpy.linalg as la import simtk.unit as unit import mdtraj as mdj from geomm.recentering import recenter_pair from geomm.rmsd import calc_rmsd from wepy.resampling.distances.distance import Distance class OpenMMDistance(Distance): """ Class for distance metrics that take in OpenMM walkers and return matrix of distances """ def _xyz_from_walkers(self, walkers, keep_atoms=[]): if len(keep_atoms) == 0: keep_atoms = range(np.shape(walkers[0].positions)[0]) return np.stack(([np.array(w.state.positions.value_in_unit(unit.nanometer))[keep_atoms,:] for w in walkers]),axis=0) def _box_from_walkers(self, walkers): return np.stack(([np.array([la.norm(v._value) for v in w.state.box_vectors]) for w in walkers]),axis=0) class OpenMMUnbindingDistance(OpenMMDistance): # The distance function here returns a distance matrix where the element (d_ij) is the # RMSD between walkers i and j. The RMSD is computed using the geomm package, by aligning # to the binding site atoms, and taking the RMSD of the ligand atoms. It uses alternative maps # for the binding site atoms when defined, and aligns to all alternative maps, returning the # minimum RMSD computed over all maps. def __init__(self, topology=None, ligand_idxs=None, binding_site_idxs=None, alt_maps=None): self.topology = topology self.ligand_idxs = ligand_idxs self.binding_site_idxs = binding_site_idxs self.alt_maps = alt_maps # alt_maps are alternative mappings to the binding site. 
this # program now assumes that all atoms in alternative maps are # contained in binding_site_idxs list def score(self, walkers): num_walkers = len(walkers) small_lig_idxs = np.array(range(len(self.ligand_idxs))) small_bs_idxs = np.array(range(len(self.ligand_idxs), len(self.ligand_idxs) + len(self.binding_site_idxs))) keep_atoms = np.concatenate((self.ligand_idxs, self.binding_site_idxs), axis=0) small_pos = self._xyz_from_walkers(walkers, keep_atoms) box_lengths = self._box_from_walkers(walkers) newpos_small = np.zeros_like(small_pos) for frame_idx, positions in enumerate(small_pos): newpos_small[frame_idx, :, :] = recenter_pair(positions, box_lengths[frame_idx], small_lig_idxs, small_bs_idxs) small_top = self.topology.subset(keep_atoms) traj_rec = mdj.Trajectory(newpos_small, small_top) traj_rec.superpose(traj_rec, atom_indices=small_bs_idxs) dist_mat = np.zeros((num_walkers, num_walkers)) for i in range(num_walkers-1): dist_mat[i][i] = 0 for j in range(i+1, num_walkers): # return the distance matrix in Angstroms dist_mat[i][j] = 10.0 * calc_rmsd(traj_rec.xyz[i], traj_rec.xyz[j], small_lig_idxs) dist_mat[j][i] = dist_mat[i][j] if self.alt_maps is not None: # figure out the "small" alternative maps small_alt_maps = deepcopy(self.alt_maps) for i, a in enumerate(self.alt_maps): for j, e in enumerate(a): try: small_alt_maps[i][j] = list(self.binding_site_idxs).index(e) +\ len(self.ligand_idxs) except: raise Exception( 'Alternative maps are assumed to be permutations of existing' ' binding site indices') for alt_map in small_alt_maps: alt_traj_rec = mdj.Trajectory(newpos_small, small_top) alt_traj_rec.superpose(alt_traj_rec, atom_indices=small_bs_idxs, ref_atom_indices=alt_map) for i in range(num_walkers-1): for j in range(i+1, num_walkers): dist = calc_rmsd(traj_rec.xyz[i], alt_traj_rec.xyz[j], small_lig_idxs) if dist < dist_mat[i][j]: dist_mat[i][j] = dist dist_mat[j][i] = dist return dist_mat class OpenMMRebindingDistance(OpenMMDistance): # The distance function here returns a distance matrix where the element (d_ij) is the # difference between 1/RMSD_0(i) and 1/RMSD_0(j). Where RMSD_0(i) is the RMSD of walker i # to the reference structure (comp_xyz), which is typically the crystallographic bound state. # The RMSDs to the bound state are computed using the geomm package, by aligning # to the binding site atoms, and taking the RMSD of the ligand atoms. It uses alternative maps # for the binding site atoms when defined, and aligns to all alternative maps, returning the # minimum RMSD computed over all maps. 
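
# Illustrative sketch only, with hypothetical values: the pairwise distance the
# comment above describes, d_ij = |1/RMSD_0(i) - 1/RMSD_0(j)|, computed with plain
# numpy from per-walker RMSDs to the reference (bound) structure.
import numpy as np

rmsd_native_example = np.array([0.2, 0.5, 2.0])   # hypothetical RMSDs to the bound state
inv_example = 1.0 / rmsd_native_example
dist_mat_example = np.abs(inv_example[:, None] - inv_example[None, :])  # symmetric, zero diagonal
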
def __init__(self, topology=None, ligand_idxs=None, binding_site_idxs=None, alt_maps=None, comp_xyz=None): self.topology = topology self.ligand_idxs = ligand_idxs self.binding_site_idxs = binding_site_idxs self.alt_maps = alt_maps self.comp_traj = self._make_comp_traj(comp_xyz) # alt_maps are alternative mappings to the binding site # this program now assumes that all atoms in alternative maps are contained in binding_site_idxs list # comp_xyz are the xyz coordinates of a reference state (usually the bound state) # this assumes that the xyz comes from the same topology def _make_comp_traj(self, comp_xyz): small_lig_idxs = np.array(range(len(self.ligand_idxs))) small_bs_idxs = np.array(range(len(self.ligand_idxs), len(self.ligand_idxs)+len(self.binding_site_idxs))) keep_atoms = np.concatenate((self.ligand_idxs, self.binding_site_idxs), axis=0) small_top = self.topology.subset(keep_atoms) small_pos = np.array(comp_xyz)[:,keep_atoms,:] return mdj.Trajectory(small_pos, small_top) def get_rmsd_native(self, walkers): num_walkers = len(walkers) small_lig_idxs = np.array(range(len(self.ligand_idxs))) small_bs_idxs = np.array(range(len(self.ligand_idxs), len(self.ligand_idxs)+len(self.binding_site_idxs))) keep_atoms = np.concatenate((self.ligand_idxs, self.binding_site_idxs), axis=0) small_pos = self._xyz_from_walkers(walkers, keep_atoms) box_lengths = self._box_from_walkers(walkers) newpos_small = np.zeros_like(small_pos) for frame_idx, positions in enumerate(small_pos): newpos_small[frame_idx, :, :] = recenter_pair(positions, box_lengths[frame_idx], small_lig_idxs, small_bs_idxs) small_top = self.topology.subset(keep_atoms) traj_rec = mdj.Trajectory(newpos_small, small_top) traj_rec.superpose(self.comp_traj, atom_indices=small_bs_idxs) rmsd_native = np.zeros((num_walkers)) for i in range(num_walkers): rmsd_native[i] = calc_rmsd(traj_rec.xyz[i], self.comp_traj.xyz[0], small_lig_idxs) if self.alt_maps is not None: # figure out the "small" alternative maps small_alt_maps = deepcopy(self.alt_maps) for i, a in enumerate(self.alt_maps): for j, e in enumerate(a): try: small_alt_maps[i][j] = list(self.binding_site_idxs).index(e) +\ len(self.ligand_idxs) except: raise Exception( 'Alternative maps are assumed to be permutations of' ' existing binding site indices') for alt_map in small_alt_maps: alt_traj_rec = mdj.Trajectory(newpos_small,small_top) alt_traj_rec.superpose(self.comp_traj, atom_indices=small_bs_idxs, ref_atom_indices=alt_map) for i in range(num_walkers): dist = calc_rmsd(alt_traj_rec.xyz[i], self.comp_traj.xyz[0], small_lig_idxs) if dist < rmsd_native[i]: rmsd_native[i] = dist return rmsd_native def score(self, walkers): num_walkers = len(walkers) rmsd_native = self.get_rmsd_native(walkers) dist_mat = np.zeros((num_walkers, num_walkers)) for i in range(num_walkers-1): dist_mat[i][i] = 0 for j in range(i+1,num_walkers): dist_mat[i][j] = abs(1./rmsd_native[i] - 1./rmsd_native[j]) dist_mat[j][i] = dist_mat[i][j] return dist_mat class OpenMMNormalModeDistance(OpenMMDistance): # The distance function here returns a distance matrix where the element (d_ij) is the # distance in "normal mode space". The NM coordinates are determined by aligning a structure to # align_xyz, and obtaining the dot product of a subset of coordinates (specified by align_idxs, typically # C-alphas), to a set of modes contained in modefile. 
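
# Illustrative sketch only, with hypothetical arrays: the normal-mode projection the
# comment above describes. Aligned coordinates (converted to Angstroms) are flattened
# and dotted with each mode vector, giving one coordinate per mode in "normal mode space".
import numpy as np

n_atoms_example, n_modes_example = 4, 2
coords_example = np.random.rand(n_atoms_example, 3).flatten() * 10.0   # nm -> Angstroms
modes_example = np.random.rand(n_modes_example, 3 * n_atoms_example)   # one row per mode
nm_coords_example = modes_example @ coords_example                     # shape (n_modes_example,)
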
    def __init__(self, topology=None, align_idxs=None, align_xyz=None,
                 n_modes=5, modefile=None):

        self.topology = topology
        self.n_modes = n_modes
        self.align_idxs = align_idxs

        assert len(align_xyz[0]) == len(align_idxs), \
            "align_xyz and align_idxs must have the same number of atoms"

        self.small_top = self.topology.subset(align_idxs)
        self.align_traj = mdj.Trajectory(align_xyz, self.small_top)

        try:
            modes = np.loadtxt(modefile)
        except Exception:
            raise Exception('Error reading from modefile: {}'.format(modefile))

        for m in modes.T:
            assert len(m) == 3 * len(align_idxs), \
                "Number of elements in each mode must be 3X the number of atoms"

        self.modes = modes.T

    def score(self, walkers):
        num_walkers = len(walkers)

        keep_atoms = np.array(self.align_idxs)
        small_pos = self._xyz_from_walkers(walkers, keep_atoms)
        box_lengths = self._box_from_walkers(walkers)

        traj_rec = mdj.Trajectory(small_pos, self.small_top)
        traj_rec.superpose(self.align_traj)

        # project each walker's aligned coordinates onto the normal modes
        vecs = [np.zeros((self.n_modes)) for i in range(num_walkers)]
        for i in range(num_walkers):
            coor_angstroms = traj_rec.xyz[i, :, :].flatten() * 10.0
            for modenum in range(self.n_modes):
                vecs[i][modenum] = np.dot(coor_angstroms, self.modes[modenum])

        # calculate the distance matrix in normal mode space
        dist_mat = np.zeros((num_walkers, num_walkers))
        for i in range(num_walkers):
            for j in range(i + 1, num_walkers):
                dist = np.linalg.norm(vecs[i] - vecs[j], ord=2)
                dist_mat[i][j] = dist
                dist_mat[j][i] = dist

        return dist_mat

class OpenMMHBondDistance(OpenMMDistance):
    # The distance function here returns a distance matrix where the
    # element (d_ij) is the distance in "interaction space". A vector
    # is built for each structure where the elements describe the
    # presence of a particular hydrogen bond between two atom
    # selections (ligand_idxs and protein_idxs). The hydrogen
    # bonds are enumerated and detected by the mastic package.
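
# Illustrative sketch only, not a method of the class below: a standalone paraphrase
# of the smooth hydrogen-bond score that `quantify_inte` implements, assuming the
# donor-acceptor angle has already been folded into the range [0, 180] degrees.
def hbond_score_sketch(dist, ang, dsmall=3.5, dlarge=6.0, ang_min=100.0):
    # distance term: 1 at or below dsmall, falling linearly to 0 at dlarge
    d_term = min(1.0, max(0.0, 1.0 - (dist - dsmall) / (dlarge - dsmall)))
    # angle term: 1 at or above ang_min, falling linearly to 0 at 0 degrees
    a_term = min(1.0, max(0.0, 1.0 - (ang_min - ang) / ang_min))
    return d_term * a_term

# e.g. hbond_score_sketch(3.0, 150.0) == 1.0 for a short, nearly linear hydrogen bond
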
def __init__(self, ligand_idxs=None, protein_idxs=None, profiler=None, sys_type=None, dsmall=3.5, dlarge=6.0, ang_min=100): self.ligand_idxs = ligand_idxs self.protein_idxs = protein_idxs self.profiler = profiler self.sys_type = sys_type # list of interactions, which define axes in the interaction # space, will grow as sim goes on self.inte_list = [] self.dsmall = dsmall self.dlarge = dlarge self.ang_min = ang_min def quantify_inte(self, ang, dist): # smoothly quantifies the presence of a hydrogen bond given # the angle and distance if dist < self.dsmall: ds = 1 else: ds = 1.0 - (dist - self.dsmall)/(self.dlarge - self.dsmall) if ds < 0: ds = 0 if ang > 180: ang -= 360 if ang < 0: ang = -ang if ang > self.ang_min: ang_s = 1 else: ang_s = 1.0 - (self.ang_min - ang)/self.ang_min if ang_s < 0: ang_s = 0 strength = ds*ang_s return strength def vectorize_profiles(self, profiles, n_walkers): inte_vec = [np.array([]) for i in range(n_walkers)] for i in range(n_walkers): inte_vec[i] = np.zeros(np.size(self.inte_list)) # loop through interactions in profile names = profiles[i].hit_idxs n = len(names) angles = [profiles[i].hit_inx_records()[j].angle for j in range(n)] dists = [profiles[i].hit_inx_records()[j].distance for j in range(n)] for j in range(n): if names[j] in self.inte_list: index = self.inte_list.index(names[j]) inte_vec[i][index] = self.quantify_inte(angles[j],dists[j]) else: self.inte_list.append(names[j]) inte_vec[i] = np.append(inte_vec[i],self.quantify_inte(angles[j],dists[j])) # pad inte_vecs with zeros, if necessary for i in range(n_walkers): if inte_vec[i].size != len(self.inte_list): to_add = len(self.inte_list) - inte_vec[i].size inte_vec[i] = np.append(inte_vec[i],np.zeros(to_add)) return inte_vec, names def distance(self, walkers): n_walkers = len(walkers) keep_atoms = np.concatenate((self.ligand_idxs, self.protein_idxs),axis=0) small_lig_idxs = np.array(range(len(self.ligand_idxs))) small_prot_idxs = np.array(range(len(self.ligand_idxs), len(self.ligand_idxs)+len(self.protein_idxs))) # recenter protein and ligand small_pos = self._xyz_from_walkers(walkers, keep_atoms) box_lengths = self._box_from_walkers(walkers) newpos_small = np.zeros_like(small_pos) for frame_idx, positions in enumerate(small_pos): newpos_small[frame_idx, :, :] = recenter_pair(positions, box_lengths[frame_idx], small_lig_idxs, small_bs_idxs) # profile ligand-protein interactions for each walker (in parallel) profiles = [[] for i in range(n_walkers)] for i in range(n_walkers): # pass coordinates in angstroms coords = [10*newpos_small[i][small_lig_idxs], 10*newpos_small[i][small_prot_idxs]] system = self.sys_type.to_system(coords) profiles[i] = self.profiler.profile(system) # vectorize profiles (in serial) inte_vec, names = self.vectorize_profiles(profiles, n_walkers) # calculate distance matrix in interaction space dist_mat = np.zeros((n_walkers, n_walkers)) for i in range(n_walkers): for j in range(i+1, n_walkers): dist = np.linalg.norm(inte_vec[i]-inte_vec[j], ord=2) dist_mat[i][j] = dist dist_mat[j][i] = dist return dist PK!'wepy/resampling/distances/randomwalk.py""" This module here is part of RandomWalk object that implements computing distance between pairs of positions of RandomWalk walkers. """ import logging import numpy as np from wepy.resampling.distances.distance import Distance class RandomWalkDistance(Distance): """ Computes the distance between pairs of positions and returns a distance matrix where the element (d_ij) is the average of the difference between posiotion of walker i and j. 
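    For example, images [0., 2., 4.] and [1., 1., 1.] are a distance of
    (1 + 1 + 3) / 3 = 5/3 apart.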
""" def __init__(self): pass def image(self, state): return state['positions'] def image_distance(self, image_a, image_b): """Compute the distance between posiotion of two states. :param position_a: posiotion of first state :param position_b: posiotion of second state :returns: a distance value :rtype: float """ return np.average(np.abs(image_a - image_b)) PK!&8%wepy/resampling/distances/receptor.pyimport logging import numpy as np from wepy.util.util import box_vectors_to_lengths_angles from geomm.recentering import recenter_pair from geomm.superimpose import superimpose from geomm.rmsd import calc_rmsd from wepy.resampling.distances.distance import Distance class UnbindingDistance(Distance): def __init__(self, ligand_idxs, binding_site_idxs, ref_state): # the idxs of the ligand and binding site from the whole state self._lig_idxs = ligand_idxs self._bs_idxs = binding_site_idxs # number of atoms in each self._n_lig_atoms = len(self._lig_idxs) self._n_bs_atoms = len(self._bs_idxs) # the idxs used for the whole image self._image_idxs = np.concatenate( (self._lig_idxs, self._bs_idxs) ) # the idxs of the ligand and binding site within the image self._image_lig_idxs = np.arange(self._n_lig_atoms) self._image_bs_idxs = np.arange(self._n_lig_atoms, self._n_lig_atoms + self._n_bs_atoms) # save the reference state's image so we can align all further # images to it self.ref_image = self._unaligned_image(ref_state) def _unaligned_image(self, state): # get the box lengths from the vectors box_lengths, box_angles = box_vectors_to_lengths_angles(state['box_vectors']) # recenter the protein-ligand complex into the center of the # periodic boundary conditions rece_positions = recenter_pair(state['positions'], box_lengths, self._bs_idxs, self._lig_idxs) # slice these positions to get the image state_image = rece_positions[self._image_idxs] return state_image def image(self, state): # get the unaligned image state_image = self._unaligned_image(state) # then superimpose it to the reference structure sup_image = superimpose(self.ref_image, state_image, idxs=self._image_bs_idxs) return sup_image def image_distance(self, image_a, image_b): # then we calculate the rmsd of only the ligands between the # images lig_rmsd = calc_rmsd(image_a, image_b, idxs=self._image_lig_idxs) return lig_rmsd PK!&wepy/resampling/resamplers/__init__.pyPK!^S(S($wepy/resampling/resamplers/random.pyimport logging from wepy.resampling.resamplers.resampler import Resampler # for the framework from wepy.resampling.deciders.clone_merge import RandomCloneMergeDecider from wepy.resampling.scoring.scorer import RandomScorer # for the monolithic resampler import random as rand from wepy.resampling.decisions.clone_merge import MultiCloneMergeDecision import random as rand from wepy.resampling.decisions.clone_merge import CloneMergeDecision class RandomCloneMergeDecider(Resampler): """ WIP not working!! 
""" N_CLONES = 2 MIN_N_WALKERS = N_CLONES + 1 MIN_MERGE = 2 DECISION = CloneMergeDecision INSTRUCTIONS = dict(DECISION.INSTRUCTION_RECORDS) def __init__(self, seed=None): raise NotImplementedError("WIP do not use") if seed is not None: self.seed = seed rand.seed(seed) def decide(self, novelties): n_walkers = len(novelties) # decide the maximum number of splittings that can be done in # one decision step, while keeping at least one merge target max_n_splits = (n_walkers-1) // self.N_CLONES # check to make sure there is enough walkers to clone and merge if max_n_splits < 1: raise TypeError("There must be at least 3 walkers to do cloning and merging") # choose a number of splittings to perform n_splits = rand.randint(1, max_n_splits) # the number of merges will be the same decisions = [None for _ in novelties] # for the number of splittings to do choose walkers to split # randomly split_idxs = rand.sample(list(range(n_walkers)), n_splits) # choose the target slots for these walkers randomly avail_slot_idxs = set(range(n_walkers)) for split_idx in split_idxs: # take some of these slots to put clones into split_target_idxs = rand.sample(avail_slot_idxs, self.N_CLONES) # remove these from the available target slots avail_slot_idxs.difference_update(split_target_idxs) # save this decision and instruction for this split # walker decisions[split_idx] = self.DECISION.record(self.DECISION.ENUM.CLONE.value, split_target_idxs) # make a set of the walkers available for merging avail_walker_idxs = set(range(n_walkers)).difference(split_idxs) # choose the target slots for the merges for merge_idx in range(n_splits): # choose the walkers to squash into this merge group merge_grp_walker_idxs = set(rand.sample(avail_walker_idxs, self.N_CLONES)) # remove these from the available walker idxs avail_walker_idxs.difference_update(merge_grp_walker_idxs) # choose the walker state to keep at random keep_idx = rand.sample(merge_grp_walker_idxs, 1)[0] squash_walker_idxs = merge_grp_walker_idxs.difference([keep_idx]) # choose a target slot to put the merged walker in merge_target_idx = rand.sample(avail_slot_idxs, 1)[0] # remove that slot from the available slots avail_slot_idxs.difference_update([merge_target_idx]) # make a record for the keep walker decisions[keep_idx] = self.DECISION.record(self.DECISION.ENUM.KEEP_MERGE.value, (merge_target_idx,)) # make records for the squashed walkers for squash_walker_idx in squash_walker_idxs: decisions[squash_walker_idx] = self.DECISION.record(self.DECISION.ENUM.SQUASH.value, (merge_target_idx,)) # all leftover actionless walkers get assigned NOTHING records for walker_idx in avail_walker_idxs: # choose a slot for this walker target_idx = rand.sample(avail_slot_idxs, 1)[0] # remove this from the available walkers avail_slot_idxs.difference_update([target_idx]) # make the record decisions[walker_idx] = self.DECISION.record(self.DECISION.ENUM.NOTHING.value, (target_idx,)) return decisions, {} class RandomCloneMergeResamplerMonolithic(Resampler): """Example of a monolithic resampler that does not use any framework. Everything is implemented from scratch in this class, thus overriding everything in the super class. 
""" # constants for the class DECISION = MultiCloneMergeDecision MIN_N_WALKERS = 3 def __init__(self, seed=None, n_resamplings=10): if seed is not None: self.seed = seed rand.seed(seed) self.n_resamplings = n_resamplings def resample(self, walkers): n_walkers = len(walkers) # check to make sure there is enough walkers to clone and merge if n_walkers < self.MIN_N_WALKERS: raise TypeError("There must be at least 3 walkers to do cloning and merging") # choose number of clone-merges between 1 and 10 n_clone_merges = rand.randint(0, self.n_resamplings) result_template_str = "|".join(["{:^10}" for i in range(n_walkers+1)]) logging.info("Number of clone-merges to perform: {}".format(n_clone_merges)) resampling_actions = [] for resampling_stage_idx in range(n_clone_merges): logging.info("Resampling Stage: {}".format(resampling_stage_idx)) # choose a random walker to clone clone_idx = rand.randint(0, len(walkers)-1) clone_walker = walkers[clone_idx] # clone the chosen walker clone_children = clone_walker.clone() # choose a destination slot (index in the list) to put the clone in # the walker occupying that slot will be squashed # can't choose the same slot it is in squash_available = set(range(n_walkers)).difference({clone_idx}) squash_idx = rand.choice([walker_idx for walker_idx in squash_available]) squash_walker = walkers[squash_idx] # find a random merge target that is not either of the # cloned walkers merge_available = set(range(n_walkers)).difference({clone_idx, squash_idx}) merge_idx = rand.choice([walker_idx for walker_idx in merge_available]) merge_walker = walkers[merge_idx] # merge the squashed walker with the keep_merge walker merged_walker = squash_walker.squash(merge_walker) # make a new list of walkers resampled_walkers = [] for idx, walker in enumerate(walkers): if idx == clone_idx: # put one of the cloned walkers in the cloned one's place resampled_walkers.append(clone_children.pop()) elif idx == squash_idx: # put one of the clone children in the squashed one's place resampled_walkers.append(clone_children.pop()) elif idx == merge_idx: # put the merged walker in the keep_merged walkers place resampled_walkers.append(merged_walker) else: # if they did not move put them back where they were resampled_walkers.append(walker) # reset the walkers for the next step as the resampled walkers walkers = resampled_walkers # make the decision records for this stage of resampling # initialize to RandomCloneMergeDecision.NOTHING, and their starting index walker_actions = [self.DECISION.record(self.DECISION.ENUM.NOTHING.value, (i,)) for i in range(n_walkers)] # for the cloned one make a record for the instruction walker_actions[clone_idx] = self.DECISION.record(self.DECISION.ENUM.CLONE.value, (clone_idx, squash_idx,)) # for the squashed walker walker_actions[squash_idx] = self.DECISION.record(self.DECISION.ENUM.SQUASH.value, (merge_idx,)) # for the keep-merged walker walker_actions[merge_idx] = self.DECISION.record(self.DECISION.ENUM.KEEP_MERGE.value, (merge_idx,)) resampling_actions.append(walker_actions) # walker slot indices slot_str = result_template_str.format("slot", *[i for i in range(n_walkers)]) logging.info(slot_str) # the resampling actions decisions = [] instructions = [] for rec in walker_actions: decisions.append(str(rec.decision.name)) if rec.decision is self.DECISION.ENUM.CLONE: instructions.append(str(",".join([str(i) for i in rec.instruction]))) else: instructions.append(str(rec.instruction)) decision_str = result_template_str.format("decision", *decisions) instruction_str = 
result_template_str.format("instruct", *instructions) logging.info(decision_str) logging.info(instruction_str) # the state of the walkers at this stage of resampling walker_state_str = result_template_str.format("state", *[str(walker.state) for walker in resampled_walkers]) logging.info(walker_state_str) walker_weight_str = result_template_str.format("weight", *[str(walker.weight) for walker in resampled_walkers]) logging.info(walker_weight_str) # return values: resampled_walkers, resampler_records, resampling_data # we return no extra data from this resampler if n_clone_merges == 0: return walkers, [], {} else: # return the final state of the resampled walkers after all # stages, and the records of resampling return resampled_walkers, resampling_actions, {} PK! k22'wepy/resampling/resamplers/resampler.pyimport itertools as it from collections import defaultdict from warnings import warn import logging import numpy as np from wepy.resampling.decisions.decision import NoDecision class ResamplerError(Exception): pass class Resampler(): # data for resampling performed (continual) RESAMPLING_FIELDS = () RESAMPLING_SHAPES = () RESAMPLING_DTYPES = () RESAMPLING_RECORD_FIELDS = () # changes to the state of the resampler (sporadic) RESAMPLER_FIELDS = () RESAMPLER_SHAPES = () RESAMPLER_DTYPES = () RESAMPLER_RECORD_FIELDS = () # valid debug modes DEBUG_MODES = (True, False,) def __init__(self, min_num_walkers=Ellipsis, max_num_walkers=Ellipsis, debug_mode=False): # the min and max number of walkers that can be generated in # resampling. # Ellipsis means to keep bound it by the number of # walkers given to the resample method (e.g. if # max_num_walkers == Ellipsis and min_num_walkers == 5 and # resample is given 10 then the max will be set to 10 for that # resampling and the min will always be 5. If both are # Ellipsis then the number of walkers is kept the same) # None means that there is no bound, e.g. 
max_num_walkers == # None then there is no maximum number of walkers, however a # min_num_walkers of None in practice is 1 since there must # always be at least 1 walker if min_num_walkers not in (Ellipsis, None): if min_num_walkers < 1: raise ResamplerError("The minimum number of walkers should be at least 1") self._min_num_walkers = min_num_walkers self._max_num_walkers = max_num_walkers # initialize debug mode self._debug_mode = False # set them to the args given self.set_debug_mode(debug_mode) def resampling_field_names(self): return self.RESAMPLING_FIELDS def resampling_field_shapes(self): return self.RESAMPLING_SHAPES def resampling_field_dtypes(self): return self.RESAMPLING_DTYPES def resampling_fields(self): return list(zip(self.resampling_field_names(), self.resampling_field_shapes(), self.resampling_field_dtypes())) def resampling_record_field_names(self): return self.RESAMPLING_RECORD_FIELDS def resampler_field_names(self): return self.RESAMPLER_FIELDS def resampler_field_shapes(self): return self.RESAMPLER_SHAPES def resampler_field_dtypes(self): return self.RESAMPLER_DTYPES def resampler_fields(self): return list(zip(self.resampler_field_names(), self.resampler_field_shapes(), self.resampler_field_dtypes())) def resampler_record_field_names(self): return self.RESAMPLER_RECORD_FIELDS @property def is_debug_on(self): return self._debug_mode def set_debug_mode(self, mode): if mode not in self.DEBUG_MODES: raise ValueError("debug mode, {}, not valid".format(mode)) self._debug_mode = mode # if you want to use debug mode you have to have ipdb installed if self.is_debug_on: try: import ipdb except ModuleNotFoundError: raise ModuleNotFoundError("You must have ipdb installed to use the debug feature") def debug_on(self): if self.is_debug_on: warn("Debug mode is already on") self.set_debug_mode(True) def debug_off(self): if not self.is_debug_on: warn("Debug mode is already off") self.set_debug_mode(False) @property def max_num_walkers_setting(self): return self._max_num_walkers @property def min_num_walkers_setting(self): return self._min_num_walkers def max_num_walkers(self): """" Get the max number of walkers allowed currently""" # first check to make sure that a resampling is occuring and # we have a number of walkers to even reference if self._resampling_num_walkers is None: raise ResamplerError( "A resampling is currently not taking place so the"\ " current number of walkers is not known.") # we are in a resampling so there is a current value for the # max number of walkers else: # if the max is None then there is no max number of # walkers so we just return None if self.max_num_walkers_setting is None: return None # if the max is Ellipsis then we just return what the # current number of walkers is elif self.max_num_walkers_setting is Ellipsis: return self._resampling_num_walkers # if it is not those then it is a hard number and we just # return it else: return self.max_num_walkers_setting def min_num_walkers(self): """" Get the min number of walkers allowed currently""" # first check to make sure that a resampling is occuring and # we have a number of walkers to even reference if self._resampling_num_walkers is None: raise ResamplerError( "A resampling is currently not taking place so the"\ " current number of walkers is not known.") # we are in a resampling so there is a current value for the # min number of walkers else: # if the min is None then there is no min number of # walkers so we just return None if self.min_num_walkers_setting is None: return None # if the min is 
Ellipsis then we just return what the # current number of walkers is elif self.min_num_walkers_setting is Ellipsis: return self._resampling_num_walkers # if it is not those then it is a hard number and we just # return it else: return self.min_num_walkers_setting def _set_resampling_num_walkers(self, num_walkers): # there must be at least 1 walker in order to do resampling if num_walkers < 1: raise ResamplerError("No walkers were given to resample") # if the min number of walkers is not dynamic check to see if # this number violates the hard boundary if self._min_num_walkers in (None, Ellipsis): self._resampling_num_walkers = num_walkers elif num_walkers < self._min_num_walkers: raise ResamplerError( "The number of walkers given to resample is less than the minimum") # if the max number of walkers is not dynamic check to see if # this number violates the hard boundary if self._max_num_walkers in (None, Ellipsis): self._resampling_num_walkers = num_walkers elif num_walkers < self._max_num_walkers: raise ResamplerError( "The number of walkers given to resample is less than the maximum") def _unset_resampling_num_walkers(self): self._resampling_num_walkers = None def _resample_init(self, walkers): """Common initialization stuff for resamplers. """ # first set how many walkers there are in this resampling self._set_resampling_num_walkers(len(walkers)) def _resample_cleanup(self): # unset the number of walkers for this resampling self._unset_resampling_num_walkers() def resample(self, walkers, debug_mode=False): raise NotImplemented self._resample_init(walkers, debug_mode=debug_mode) def _init_walker_actions(self): # determine resampling actions walker_actions = [self.decision.record(enum_value=self.decision.ENUM.NOTHING.value, target_idxs=(i,)) for i in range(n_walkers)] return walker_actions def assign_clones(self, merge_groups, walker_clone_nums): n_walkers = len(walker_clone_nums) walker_actions = self._init_walker_actions() # keep track of which slots will be free due to squashing free_slots = [] # go through the merge groups and write the records for them, # the index of a merge group determines the KEEP_MERGE walker # and the indices in the merge group are the walkers that will # be squashed for walker_idx, merge_group in enumerate(merge_groups): if len(merge_group) > 0: # add the squashed walker idxs to the list of open # slots free_slots.extend(merge_group) # for each squashed walker write a record and save it # in the walker actions for squash_idx in merge_group: walker_actions[squash_idx] = self.decision.record(self.decision.ENUM.SQUASH.value, (walker_idx,)) # make the record for the keep merge walker walker_actions[walker_idx] = self.decision.record(self.decision.ENUM.KEEP_MERGE.value, (walker_idx,)) # for each walker, if it is to be cloned assign open slots for it for walker_idx, num_clones in enumerate(walker_clone_nums): if num_clones > 0 and len(merge_groups[walker_idx]) > 0: raise ResamplerError("Error! cloning and merging occuring with the same walker") # if this walker is to be cloned do so and consume the free # slot if num_clones > 0: # we first check to see if there are any free "slots" # for cloned walkers to go. If there are not we can # make room. 
The number of extra slots needed should # be default 0 # we choose the targets for this cloning, we start # with the walker initiating the cloning clone_targets = [walker_idx] # if there are any free slots, then we use those first if len(free_slots) > 0: clone_targets.extend([free_slots.pop() for clone in range(num_clones)]) # if there are more slots needed then we will have to # create them num_slots_needed = num_clones - len(clone_targets) if num_slots_needed > 0: # initialize the lists of open slots new_slots = [] # and make a list of the new slots new_slots = [n_walkers + i for i in range(num_slots_needed)] # then increase the number of walkers to match n_walkers += num_slots_needed # then add these to the clone targets clone_targets.extend(new_slots) # make a record for this clone walker_actions[walker_idx] = self.decision.record(self.decision.ENUM.CLONE.value, tuple(clone_targets)) return walker_actions class ScoreDecideResampler(Resampler): """Superclass for resamplers that use the the Novelty->Decider framework.""" def __init__(self, scorer, decider): self.scorer = scorer self.decider = decider self.decision = decider.DECISION def resample(self, walkers): # first set how many walkers there are in this resampling self._set_resample_num_walkers(len(walkers)) aux_data = {} scores, scorer_aux = self.scorer.scores(walkers) decisions, decider_aux = self.decider.decide(scores) resampled_walkers = self.decider.decision.action(walkers, decisions) aux_data.update([scorer_aux, decider_aux]) # unset the number of walkers for this resampling self._unset_resampling_num_walkers() return resampled_walkers, resampling_records, resampler_records class NoResampler(Resampler): DECISION = NoDecision def __init__(self): self.decision = self.DECISION def resample(self, walkers, **kwargs): n_walkers = len(walkers) # the walker actions are all nothings with the same walker # index which is the default initialization walker_actions = self._init_walker_actions() # we only have one step so our resampling_records are just the # single list of walker actions resampling_data = [walker_actions] # there is no change in state in the resampler so there are no # resampler records resampler_data = [{}] return walkers, resampling_data, resampler_data PK!77"wepy/resampling/resamplers/revo.pyimport multiprocessing as mulproc import random as rand import itertools as it import logging import numpy as np from wepy.resampling.resamplers.resampler import Resampler from wepy.resampling.decisions.clone_merge import MultiCloneMergeDecision class REVOResampler(Resampler): DECISION = MultiCloneMergeDecision # state change data for the resampler RESAMPLER_FIELDS = ('n_walkers', 'distance_matrix', 'spread', 'image_shape', 'images') RESAMPLER_SHAPES = ((1,), Ellipsis, (1,), Ellipsis, Ellipsis) RESAMPLER_DTYPES = (np.int, np.float, np.float, np.int, None) # fields that can be used for a table like representation RESAMPLER_RECORD_FIELDS = ('spread',) # fields for resampling data RESAMPLING_FIELDS = DECISION.FIELDS + ('step_idx', 'walker_idx',) RESAMPLING_SHAPES = DECISION.SHAPES + ((1,), (1,),) RESAMPLING_DTYPES = DECISION.DTYPES + (np.int, np.int,) # fields that can be used for a table like representation RESAMPLING_RECORD_FIELDS = DECISION.RECORD_FIELDS + ('step_idx', 'walker_idx',) def __init__(self, seed=None, pmin=1e-12, pmax=0.1, dpower=4, merge_dist=2.5, distance_characteristic=None, distance=None, init_state=None, weights=True): self.decision = self.DECISION # the minimum probability for a walker self.pmin=pmin # 
ln(probability_min) self.lpmin = np.log(pmin/100) # maximum probability for a walker self.pmax=pmax # self.dpower = dpower # self.merge_dist = merge_dist # the distance metric assert distance is not None, "Must give a distance metric class" self.distance = distance # the distance_characteristic assert distance_characteristic is not None, "Must given a distance_characteristic value" self.distance_characteristic = distance_characteristic # setting the random seed self.seed = seed if seed is not None: rand.seed(seed) # setting the weights parameter self.weights = weights # we do not know the shape and dtype of the images until # runtime so we determine them here assert init_state is not None, "must give an initial state to infer data about the image" image = self.distance.image(init_state) self.image_dtype = image.dtype # we need this to on the fly find out what the datatype of the # image is def resampler_field_dtypes(self): # index of the image idx image_idx = self.resampler_field_names().index('images') # dtypes adding the image dtype dtypes = list(super().resampler_field_dtypes()) dtypes[image_idx] = self.image_dtype return tuple(dtypes) def _calcspread(self, walkerwt, amp, distance_matrix): n_walkers = len(walkerwt) # the value to be optimized spread = 0 # wsum = np.zeros(n_walkers) # weight factors for the walkers wtfac = np.zeros(n_walkers) # set the weight factors for i in range(n_walkers): if walkerwt[i] > 0 and amp[i] > 0: if self.weights: wtfac[i] = np.log(walkerwt[i]/amp[i]) - self.lpmin else: wtfac[i] = 1 else: wtfac[i] = 0 if wtfac[i] < 0: wtfac[i] = 0 # for i in range(n_walkers - 1): if amp[i] > 0: for j in range(i+1, n_walkers): if amp[j] > 0: d = ((distance_matrix[i][j]/self.distance_characteristic)**self.dpower) * wtfac[i] * wtfac[j] spread += d * amp[i] * amp[j] wsum[i] += d * amp[j] wsum[j] += d * amp[i] # another implementation for personal clarity # for i, j in it.combinations(range(len(n_walkers)), 2): # if amp[i] > 0 and amp[j] > 0: # d = ((distance_matrix[i][j])**self.dpower) * wtfac[i] * wtfac[j] # spread += d * amp[i] * amp[j] # wsum[i] = += d * amp[j] # wsum[j] += d * amp[i] return spread, wsum def decide_clone_merge(self, walkerwt, amp, distance_matrix): n_walkers = len(walkerwt) spreads = [] merge_groups = [[] for i in range(n_walkers)] walker_clone_nums = [0 for i in range(n_walkers)] new_wt = walkerwt.copy() new_amp = amp.copy() # initialize the actions to nothing, will be overwritten # calculate the initial spread which will be optimized spread, wsum = self._calcspread(walkerwt, new_amp, distance_matrix) spreads.append(spread) # maximize the variance through cloning and merging logging.info("Starting variance optimization:", spread) productive = True while productive: productive = False # find min and max wsums, alter new_amp # initialize to None, we may not find one of each minwind = None maxwind = None # selects a walker with minimum wsum and a walker with # maximum wsum walker (distance to other walkers) will be # tagged for cloning (stored in maxwind), except if it is # already a keep merge target max_tups = [] for i, value in enumerate(wsum): # 1. must have an amp >=1 which gives the number of clones to be made of it # 2. clones for the given amplitude must not be smaller than the minimum probability # 3. 
must not already be a keep merge target if (new_amp[i] >= 1) and \ (new_wt[i]/(new_amp[i] + 1) > self.pmin) and \ (len(merge_groups[i]) == 0): max_tups.append((value, i)) if len(max_tups) > 0: maxvalue, maxwind = max(max_tups) # walker with the lowest wsum (distance to other walkers) # will be tagged for merging (stored in minwind) min_tups = [(value, i) for i,value in enumerate(wsum) if new_amp[i] == 1 and (new_wt[i] < self.pmax)] if len(min_tups) > 0: minvalue, minwind = min(min_tups) # does minwind have an eligible merging partner? # closedist = self.merge_dist closewalk = None condition_list = np.array([i is not None for i in [minwind, maxwind]]) if condition_list.all() and minwind != maxwind: # get the walkers that aren't the minimum and the max # wsum walkers, as candidates for merging closewalks = set(range(n_walkers)).difference([minwind, maxwind]) # remove those walkers that if they were merged with # the min wsum walker would violate the pmax closewalks = [idx for idx in closewalks if (new_amp[idx]==1) and (new_wt[idx] + new_wt[minwind] < self.pmax) ] # if there are any walkers left, get the distances of # the close walkers to the min wsum walker if that # distance is less than the maximum merge distance if len(closewalks) > 0: closewalks_dists = [(distance_matrix[minwind][i], i) for i in closewalks if distance_matrix[minwind][i] < (self.merge_dist)] # if any were found set this as the closewalk if len(closewalks_dists) > 0: closedist, closewalk = min(closewalks_dists) # did we find a closewalk? condition_list = np.array([i is not None for i in [minwind, maxwind, closewalk]]) if condition_list.all() : # change new_amp tempsum = new_wt[minwind] + new_wt[closewalk] new_amp[minwind] = new_wt[minwind]/tempsum new_amp[closewalk] = new_wt[closewalk]/tempsum new_amp[maxwind] += 1 # re-determine spread function, and wsum values newspread, wsum = self._calcspread(new_wt, new_amp, distance_matrix) if newspread > spread: spreads.append(newspread) logging.info("Variance move to", newspread, "accepted") productive = True spread = newspread # make a decision on which walker to keep # (minwind, or closewalk), equivalent to: # `random.choices([closewalk, minwind], # weights=[new_wt[closewalk], new_wt[minwind])` r = rand.uniform(0.0, new_wt[closewalk] + new_wt[minwind]) # keeps closewalk and gets rid of minwind if r < new_wt[closewalk]: keep_idx = closewalk squash_idx = minwind # keep minwind, get rid of closewalk else: keep_idx = minwind squash_idx = closewalk # if keep_idx == maxwind: # import ipdb; ipdb.set_trace() # if len(merge_groups[maxwind]) > 0: # import ipdb; ipdb.set_trace() # print("Attempting to clone a walker which is a keep idx of a merge group") # if walker_clone_nums[keep_idx] > 0: # import ipdb; ipdb.set_trace() # print("Attempting to merge a walker which is to be cloned") # update weight new_wt[keep_idx] += new_wt[squash_idx] new_wt[squash_idx] = 0.0 # update new_amps new_amp[squash_idx] = 0 new_amp[keep_idx] = 1 # add the squash index to the merge group merge_groups[keep_idx].append(squash_idx) # add the indices of the walkers that were already # in the merge group that was just squashed merge_groups[keep_idx].extend(merge_groups[squash_idx]) # reset the merge group that was just squashed to empty merge_groups[squash_idx] = [] # increase the number of clones that the cloned # walker has walker_clone_nums[maxwind] += 1 # new spread for starting new stage newspread, wsum = self._calcspread(new_wt, new_amp, distance_matrix) spreads.append(newspread) logging.info("variance after 
selection:", newspread) # if not productive else: new_amp[minwind] = 1 new_amp[closewalk] = 1 new_amp[maxwind] -= 1 # given we know what we want to clone to specific slots # (squashing other walkers) we need to determine where these # squashed walkers will be merged walker_actions = self.assign_clones(merge_groups, walker_clone_nums) # because there is only one step in resampling here we just # add another field for the step as 0 and add the walker index # to its record as well for walker_idx, walker_record in enumerate(walker_actions): walker_record['step_idx'] = np.array([0]) walker_record['walker_idx'] = np.array([walker_idx]) return walker_actions, spreads[-1] def _all_to_all_distance(self, walkers): # initialize an all-to-all matrix, with 0.0 for self distances dist_mat = np.zeros((len(walkers), len(walkers))) # make images for all the walker states for us to compute distances on images = [] for walker in walkers: image = self.distance.image(walker.state) images.append(image) # get the combinations of indices for all walker pairs for i, j in it.combinations(range(len(images)), 2): # calculate the distance between the two walkers dist = self.distance.image_distance(images[i], images[j]) # save this in the matrix in both spots dist_mat[i][j] = dist dist_mat[j][i] = dist return [walker_dists for walker_dists in dist_mat], images def resample(self, walkers): n_walkers = len(walkers) walkerwt = [walker.weight for walker in walkers] amp = [1 for i in range(n_walkers)] # calculate distance matrix distance_matrix, images = self._all_to_all_distance(walkers) logging.info("distance_matrix") logging.info(np.array(distance_matrix)) # determine cloning and merging actions to be performed, by # maximizing the spread, i.e. the Decider resampling_data, spread = self.decide_clone_merge(walkerwt, amp, distance_matrix) # convert the target idxs and decision_id to feature vector arrays for record in resampling_data: record['target_idxs'] = np.array(record['target_idxs']) record['decision_id'] = np.array([record['decision_id']]) # actually do the cloning and merging of the walkers resampled_walkers = self.decision.action(walkers, [resampling_data]) # flatten the distance matrix and give the number of walkers # as well for the resampler data, there is just one per cycle resampler_data = [{'distance_matrix' : np.ravel(np.array(distance_matrix)), 'n_walkers' : np.array([len(walkers)]), 'spread' : np.array([spread]), 'images' : np.ravel(np.array(images)), 'image_shape' : np.array(images[0].shape)}] return resampled_walkers, resampling_data, resampler_data PK! Wݜ;;&wepy/resampling/resamplers/wexplore.pyimport math import random as rand import itertools as it from collections import namedtuple, defaultdict from copy import copy, deepcopy import logging import numpy as np import networkx as nx from wepy.resampling.resamplers.resampler import Resampler, ResamplerError from wepy.resampling.decisions.clone_merge import MultiCloneMergeDecision class RegionTreeError(Exception): pass ## Merge methods # algorithms for finding the number of mergeable walkers in a group def calc_squashable_walkers_single_method(walker_weights, max_weight): # to get an estimate of the number of squashable walkers we start # summing the weights starting from the smallest walker. 
When the # addition of the next highest weight walker would make the total # greater than max_weight then we quit and say that the number of # squashable walkers is the number of them summed up, minus one # for the fact that one of them won't be squashed if a merge of # all of them was to occur n_squashable = 0 # there must be at least 2 walkers in order to be able to do a # merge, so if there are not enough the number of squashable # walkers is 0 if len(walker_weights) < 2: return n_squashable # sort the weights smallest to biggest walker_weights.sort() idx = 0 sum_weights = walker_weights[idx] merge_size = 1 while sum_weights <= max_weight: # if the next index would be out of bounds break out of the # loop if idx + 1 >= len(walker_weights): break else: idx += 1 # add this walker to the sum weights sum_weights += walker_weights[idx] # add one to the merge size (since we only will make our # estimate based on the single largest possible merge) merge_size += 1 else: # the loop condition failed so we remove the last count of # merge size from the merge group. This won't run if we break # out of the loop because of we are out of walkers to include merge_size -= 1 # then we also take one less than that as the number of # squashable walkers n_squashable = merge_size - 1 return n_squashable # algorithms for actually generating the merge groups def decide_merge_groups_single_method(walker_weights, balance, max_weight): assert balance < 0, "target balance must be negative" # the number of walkers we need to choose in order to be # able to do the required amount of merges num_merge_walkers = abs(balance) + 1 # select the lowest weight walkers to use for merging, these # are idxs on the mergeable walkers and not the walker_idxs chosen_idxs = np.argsort(walker_weights)[:num_merge_walkers] # check that this is not greater than the max weight if sum([walker_weights[chosen_idx] for chosen_idx in chosen_idxs]) > max_weight: result = False else: result = True # return the chosen idxs as the sole full merge group return [chosen_idxs], result ## Clone methods def calc_max_num_clones(walker_weight, min_weight, max_num_walkers): # initialize it to no more clones max_n_clones = 0 # start with a two splitting n_splits = 2 # then increase it every time it passes or until we get to the # max number of walkers while ((walker_weight / n_splits) >= min_weight) and \ (n_splits <= max_num_walkers): n_splits += 1 # the last step failed so the number of splits is one less # then we counted n_splits -= 1 # we want the number of clones so we subtract one from the # number of splits to get that, and we save this for this # walker max_n_clones = n_splits - 1 return max_n_clones class RegionTree(nx.DiGraph): # the strings for choosing a method of solving how deciding how # many walkers can be merged together given a group of walkers and # the associated algorithm for actually choosing them MERGE_METHODS = ('single',) # Description of the methods # 'single' : this method simplifies the problem (likely giving # very suboptimal solutions especially early in sampling when # walkers are of similar large weights) by enforcing that within a # group of walkers (i.e. in a leaf region node) only one merge # will take place. To decide how large a given merge group can be # then is simply found by consecutively summing the weights of the # smallest walkers until the inclusion of the next highest # violates the maximum weight. 
Thus the algorithm for actually # finding the walkers that shall be merged is as simple as taking # the K lowest walkers given by the first algorithm. This is then # guaranteed to satisfy the potential. # as further methods are mathematically proven and algorithms # designed this will be the chosen method. ROOT_NODE = () def __init__(self, init_state, max_n_regions=None, max_region_sizes=None, distance=None, pmin=None, pmax=None, merge_method='single'): super().__init__() if (max_n_regions is None) or \ (max_region_sizes is None) or \ (distance is None) or \ (pmin is None) or \ (pmax is None): raise ValueError("All parameters must be defined, 1 or more are missing.") self._max_n_regions = max_n_regions self._n_levels = len(max_n_regions) self._max_region_sizes = max_region_sizes self._distance = distance self._pmin = pmin self._pmax = pmax # initialize the max and min number of walkers, this is a # dynamic thing and is manually set by the WExploreResampler self._max_num_walkers = False self._min_num_walkers = False assert merge_method in self.MERGE_METHODS, \ "the merge method given, '{}', must be one of the methods available {}".format( merge_method, self.MERGE_METHODS) self._merge_method = merge_method self._walker_weights = [] self._walker_assignments = [] image_idx = 0 # get the image using the distance object image = self.distance.image(init_state) self._images = [image] parent_id = self.ROOT_NODE self.add_node(parent_id, image_idx=0, n_walkers=0, n_squashable=0, n_possible_clones=0, balance=0, walker_idxs=[]) # make the first branch for level in range(len(max_n_regions)): child_id = parent_id + (0,) self.add_node(child_id, image_idx=image_idx, n_walkers=0, n_squashable=0, n_possible_clones=0, balance=0, walker_idxs=[]) self.add_edge(parent_id, child_id) parent_id = child_id # add the region for this branch to the regions list self._regions = [tuple([0 for i in range(self._n_levels)])] @property def merge_method(self): return self._merge_method @property def distance(self): return self._distance @property def images(self): return self._images @property def max_n_regions(self): return self._max_n_regions @property def n_levels(self): return self._n_levels @property def max_region_sizes(self): return self._max_region_sizes @property def pmin(self): return self._pmin @property def pmax(self): return self._pmax @property def walker_assignments(self): return self._walker_assignments @property def walker_weights(self): return self._walker_weights @property def regions(self): return self._regions def add_child(self, parent_id, image_idx): # make a new child id which will be the next index of the # child with the parent id child_id = parent_id + (len(self.children(parent_id)), ) # create the node with the image_idx self.add_node(child_id, image_idx=image_idx, n_walkers=0, n_squashable=0, n_possible_clones=0, balance=0, walker_idxs=[]) # make the edge to the child self.add_edge(parent_id, child_id) return child_id def children(self, parent_id): children_ids = list(self.adj[parent_id].keys()) # sort them children_ids.sort() return children_ids def level_nodes(self, level): """Get the nodes/regions at the specified level.""" if level > self.n_levels: raise ValueError("level is greater than the number of levels for this tree") return [node_id for node_id in self.nodes if len(node_id) == level] def leaf_nodes(self): return self.level_nodes(self.n_levels) def branch_tree(self, parent_id, image): # add the new image to the image index image_idx = len(self._images) self._images.append(image) 
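        # the new branch is rooted at the parent node and is extended down
        # to the leaf level, one child per level, all sharing this image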
branch_level = len(parent_id) # go down from there and create children for level in range(branch_level, self.n_levels): child_id = self.add_child(parent_id, image_idx) parent_id = child_id #add new assignment to the image assignments self._regions.append(child_id) # return the leaf node id of the new branch return child_id @property def max_num_walkers(self): return self._max_num_walkers @max_num_walkers.setter def max_num_walkers(self, max_num_walkers): """This must be an integer.""" self._max_num_walkers = max_num_walkers @max_num_walkers.deleter def max_num_walkers(self, max_num_walkers): """This must be an integer.""" self._max_num_walkers = None @property def min_num_walkers(self): return self._min_num_walkers @min_num_walkers.setter def min_num_walkers(self, min_num_walkers): """This must be an integer.""" self._min_num_walkers = min_num_walkers @min_num_walkers.deleter def min_num_walkers(self, min_num_walkers): """This must be an integer.""" self._min_num_walkers = None def assign(self, state): assignment = [] dists = [] # a cache for the distance calculations so they need not be # performed more than once dist_cache = {} # perform a n-ary search through the hierarchy of regions by # performing a distance calculation to the images at each # level starting at the top node = self.ROOT_NODE for level in range(self.n_levels): level_nodes = self.children(node) # perform a distance calculation to all nodes at this # level image_dists = [] for level_node in level_nodes: # get the image image_idx = self.node[level_node]['image_idx'] image = self.images[image_idx] # if this distance is already calculated don't # calculate it again and just get it from the cache if image_idx in dist_cache: dist = dist_cache[image_idx] # otherwise calculate it and save it in the cache else: # image of the state state_image = self.distance.image(state) # there is the possibility of try: dist = self.distance.image_distance(state_image, image) except ValueError: print("state: ", state.dict()) print("state_image: ", state_image) print("image: ", image) raise ValueError("If you have triggered this error you have" " encountered a rare bug. 
Please attempt to" " report this using the printed outputs.") # save in the dist_cache dist_cache[image_idx] = dist # add it to the dists for this state image_dists.append(dist) # get the index of the image that is closest level_closest_child_idx = np.argmin(image_dists) # get the distance for the closest image level_closest_image_dist = image_dists[level_closest_child_idx] # save for return assignment.append(level_closest_child_idx) dists.append(level_closest_image_dist) # set this node as the next node node = level_nodes[level_closest_child_idx] return tuple(assignment), tuple(dists) def clear_walkers(self): """Remove all walkers from the regions.""" # reset the walker assignments to an empty list self._walker_assignments = [] self._walker_weights = [] # set all the node attributes to their defaults for node_id in self.nodes: self.node[node_id]['n_walkers'] = 0 self.node[node_id]['walker_idxs'] = [] self.node[node_id]['n_squashable'] = 0 self.node[node_id]['n_possible_clones'] = 0 self.node[node_id]['balance'] = 0 def place_walkers(self, walkers): # clear all the walkers and reset node attributes to defaults self.clear_walkers() # keep track of new branches made new_branches = [] # place each walker for walker_idx, walker in enumerate(walkers): # assign the state of the walker to the tree and get the # distances to the images at each level assignment, distances = self.assign(walker.state) # check the distances going down the levels to see if a # branching (region creation) is necessary for level, distance in enumerate(distances): # if we are over the max region distance and we are # not above max number of regions we have found a new # region so we branch the region_tree at that level if distance > self.max_region_sizes[level] and \ len(self.children(assignment[:level])) < self.max_n_regions[level]: # make an image for the region image = self.distance.image(walker.state) parent_id = assignment[:level] # make the new branch assignment = self.branch_tree(parent_id, image) # save it to keep track of new branches as they occur new_branches.append({'distance' : np.array([distance]), 'branching_level' : np.array([level]), 'new_leaf_id' : np.array(assignment), 'image' : image,}) # we have made a new branch so we don't need to # continue this loop break # save the walker assignment self._walker_assignments.append(assignment) self._walker_weights.append(walker.weight) # go back through the nodes in this walker's branch # increase the n_walkers for each node, and save the # walkers (index in self.walker_assignments) it has, and # save increase the number above pmin if valid for level in range(len(assignment) + 1): node_id = assignment[:level] self.node[node_id]['n_walkers'] += 1 self.node[node_id]['walker_idxs'].append(walker_idx) # We also want to find out some details about the ability of # the leaf nodes to clone and merge walkers. This is useful # for being able to balance the tree. 
Once this has been # figured out for the leaf nodes we want to aggregate these # numbers for the higher level regions for node_id in self.leaf_nodes(): leaf_walker_idxs = self.node[node_id]['walker_idxs'] leaf_weights = [self.walker_weights[i] for i in leaf_walker_idxs] # first figure out how many walkers are squashable (AKA # reducible) n_squashable = self._calc_squashable_walkers(leaf_weights) # get the max number of clones for each walker and sum # them up to get the total number of cloneable walkers walker_max_n_clones = [self._calc_max_num_clones(walker_weight) for walker_weight in leaf_weights] n_possible_clones = sum(walker_max_n_clones) # actually set them as attributes for the node self.node[node_id]['n_squashable'] = n_squashable self.node[node_id]['n_possible_clones'] = n_possible_clones # also add this amount to all of the nodes above it # n_squashable for level in reversed(range(self.n_levels)): branch_node_id = node_id[:level] self.node[branch_node_id]['n_squashable'] += n_squashable # n_posssible_clones for level in reversed(range(self.n_levels)): branch_node_id = node_id[:level] self.node[branch_node_id]['n_possible_clones'] += n_possible_clones return new_branches @classmethod def _max_n_merges(cls, pmax, root, weights): # indices of the weights walker_idxs = [i for i, weight in enumerate(weights)] # remove the root from the weights unused_walker_idxs = list(set(walker_idxs).difference(root)) # initialize the number of merges identified by the length of # the current root max_n_merges = len(root) - 1 # then combine the root with the unused weights for root, merge_candidate in it.product([root], unused_walker_idxs): # get the weights for this combo combo_weights = [weights[i] for i in root] + [weights[merge_candidate]] # sum them sum_weight = sum(combo_weights) # if the sum of the weights is less than or equal than the # pmax then this combination beats the current record of # the root if sum_weight <= pmax: # then we know that the number of merges is at least # one more than the root max_n_merges += 1 # if we still haven't reached the pmax continue making # merges to see if we can beat this record if sum_weight < pmax: # make a new root for this combo and recursively call # this method new_combo = (*root, merge_candidate,) # this will return the maximum number of merges from # this subset of the walkers n_merges = cls._max_n_merges(pmax, new_combo, weights) # if this is greater than the current record # overwrite it if n_merges > max_n_merges: max_n_merges = n_merges # if it is exactly pmax then no more merges can be # done so we can just end here and return this record elif sum_weight == pmax: break # if no combination of this root and other candidates can make # any more merges than we just return the roots number of merges return max_n_merges def _calc_squashable_walkers(self, walker_weights): if self.merge_method == 'single': n_squashable = calc_squashable_walkers_single_method(walker_weights, self.pmax) else: raise ValueError("merge method {} not recognized".format(self.merge_method)) return n_squashable def _calc_max_num_clones(self, walker_weight): return calc_max_num_clones(walker_weight, self.pmin, self.max_num_walkers) def _propagate_and_balance_shares(self, parental_balance, children_node_ids): # talk about "shares" which basically are the number of # slots/replicas that will be allocated to this region for # running sampling on # we get the current number of shares for each child orig_children_shares = {child_id : len(self.node[child_id]['walker_idxs']) for 
child_id in children_node_ids} # the copy to use as a tally of the shares children_shares = copy(orig_children_shares) # the donatable (squashable) walkers to start with children_donatable_shares = {child_id : self.node[child_id]['n_squashable'] for child_id in children_node_ids} # the donatable (squashable) walkers to start with children_receivable_shares = {child_id : self.node[child_id]['n_possible_clones'] for child_id in children_node_ids} # Our first goal in this subroutine is to dispense a parental # balance to it's children in a simply valid manner children_dispensations = self._dispense_parental_shares( parental_balance, children_shares, children_donatable_shares, children_receivable_shares) for child_id, dispensation in children_dispensations.items(): # update the shares, donatables, and receivables which we # will then balance between regions children_shares[child_id] += dispensation # add the dispensation to the number of the donatable # shares children_donatable_shares[child_id] += dispensation # subtract the dispensation from the number of receivable # shares children_receivable_shares[child_id] -= dispensation # Now that we have dispensed the shares to the children in a # valid way we use an algorithm to now distribute the shares # between the regions as evenly as possible children_shares = self._balance_children_shares(children_shares, children_donatable_shares, children_receivable_shares) # calculate the net change in the balances for each region net_balances = {child_id : children_shares[child_id] - orig_children_shares[child_id] for child_id in children_shares.keys()} children_balances = [balance for balance in net_balances.values()] if sum(children_balances) != parental_balance: raise RegionTreeError( "The balances of the child nodes ({}) do not balance to the parental balance ({})".format( children_balances, parental_balance)) # no state changes to the object have been made up until this # point, but now that the net change in the balances for the # children have been generated we set them into their nodes for child_node_id, child_net_balance in net_balances.items(): self.node[child_node_id]['balance'] = child_net_balance def _dispense_parental_shares(self, parental_balance, children_shares, children_donatable_shares, children_receivable_shares): """Given a parental balance and a set of children nodes, we dispense the shares indicated by the balance to the children nodes in a VALID but not necessarily optimal or desirable way. This merely checks for the hard constraints on the number of shares a region can either give or receive based on their capacity to clone and merge walkers. An additional balancing step can be performed to redistribute them. 
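        For example, a parental balance of -3 dispensed to two children
        that can each donate two squashable walkers would be handed out
        greedily as -2 and -1, and evened out by the later balancing step
        where possible.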
""" # this will be the totaled up dispensations for each child # region children_dispensations = {child_id : 0 for child_id in children_shares.keys()} # if there is only one child it just inherits all of the # balance no matter what if len(children_shares.keys()) == 1: child_node_id = list(children_shares.keys())[0] # we put the shares for this only child in a dictionary # like the other methods would produce children_dispensations[child_node_id] = parental_balance # there are more than one child so we accredit balances # between them elif len(children_shares.keys()) > 1: # if the parent has a non-zero balance we either # increase (clone) or decrease (merge) the balance # these poor children are inheriting a debt and must # decrease the total number of their shares :( if parental_balance < 0: children_dispensations = self._dispense_debit_shares(parental_balance, children_shares, children_donatable_shares) # these lucky children are inheriting a positive number of # shares!! :) elif parental_balance > 0: children_dispensations = self._dispense_credit_shares(parental_balance, children_shares, children_receivable_shares) else: raise RegionTreeError("no children nodes to give parental balance") return children_dispensations def _dispense_debit_shares(self, parental_balance, children_shares, children_donatable_shares): """For a negative parental balance we dispense it to the children nodes""" children_donatable_shares = copy(children_donatable_shares) children_dispensations = {child_id : 0 for child_id in children_shares.keys()} # list of the keys so we can iterate through them children_node_ids = list(children_shares.keys()) # dispense the negative shares as quickly as possible, # they will be balanced later child_iter = iter(children_node_ids) remaining_balance = parental_balance while remaining_balance < 0: # get the node id try: child_node_id = next(child_iter) except StopIteration: # if the parental balance is still not zero and there # are no more children then the children cannot # balance it given their constraints and there is an # error raise RegionTreeError("Children cannot pay their parent's debt") n_donatable = children_donatable_shares[child_node_id] # if this child has any squashable walkers if n_donatable > 0: # we use those for paying the parent's debt # the amount of the parental debt that can be # paid (the payment) for this child region is # either the number of squashable walkers or # the absolute value of the parental balance # (since it is negative for debts), whichever # is smaller payment = min(n_donatable, abs(remaining_balance)) # take this from the remaining balance remaining_balance += payment # and take it away from the childs due balance and shares children_dispensations[child_node_id] -= payment # also account for that in its donatable shares children_donatable_shares[child_node_id] -= payment # double check the balance is precisely 0, we want to # dispense all the shares as well as not accidentally # overdispensing assert remaining_balance == 0, "balance is not 0" return children_dispensations def _dispense_credit_shares(self, parental_balance, children_shares, children_receivable_shares): children_receivable_shares = copy(children_receivable_shares) children_dispensations = {child_id : 0 for child_id in children_shares.keys()} # list of the keys so we can iterate through them children_node_ids = list(children_shares.keys()) # dispense the shares to the able children as quickly # as possible, they will be redistributed in the next # step child_iter = 
iter(children_node_ids) remaining_balance = parental_balance while remaining_balance > 0: # get the node id try: child_node_id = next(child_iter) except StopIteration: # if the parental balance is still not zero and there # are no more children then the children cannot # balance it given their constraints and there is an # error raise RegionTreeError("Children cannot accept their parent's credit") # give as much of the parental balance as we can # to the walkers. In the next step all this # balance will be shared among the children so all # we need to do is dispense all the shares without # care as to who gets them, as long as they can # keep it n_receivable = children_receivable_shares[child_node_id] # the amount to be disbursed to this region is # either the number of possible clones (the # maximum it can receive) or the full parental # balance, whichever is smaller disbursement = min(n_receivable, abs(remaining_balance)) # give this disbursement by taking away from the # positive balance remaining_balance -= disbursement # add these shares to the net balances and share # totals children_dispensations[child_node_id] += disbursement # also subtract this from the number of receivable shares # for the child children_receivable_shares[child_node_id] -= disbursement # double check the balance is precisely 0, we want to # dispense all the shares as well as not accidentally # overdispensing assert remaining_balance == 0, "balance is not 0" return children_dispensations def _balance_children_shares(self, children_shares, children_donatable_shares, children_receivable_shares): """Given a dictionary mapping the child node_ids to the total number of shares they currently hold we balance between them in order to get an even distribution of the shares as possible. """ children_shares = copy(children_shares) # generate the actual donation pair and the amount that should # be donated for the best outcome donor_node_id, acceptor_node_id, donation_amount = \ self._gen_best_donation(children_shares, children_donatable_shares, children_receivable_shares) # if the donation amount is zero we make no donation if donation_amount > 0: # account for this donation in the shares children_shares[donor_node_id] -= donation_amount children_shares[acceptor_node_id] += donation_amount # subtract the donation donatable_shares from the donor # and add the donation to the donatable_shares of the # acceptor children_donatable_shares[donor_node_id] -= donation_amount children_donatable_shares[acceptor_node_id] += donation_amount # do the opposite to the receivable shares children_receivable_shares[donor_node_id] += donation_amount children_receivable_shares[acceptor_node_id] -= donation_amount # we have decided the first donation, however more will be # performed as long as the amount of the donation is either 0 # or that two donations of only 1 share occur twice in a # row. 
The former occurs in scenarios when there is an even # balance and the latter in an odd scenario and the last odd # share would get passed back and forth # we keep track of the previous donation, and initialize it to # None for now previous_donation_amount = donation_amount while (donation_amount > 0) and \ not (previous_donation_amount == 1 and donation_amount == 1): # update the previous donation amount previous_donation_amount = donation_amount # get the next best donation donor_node_id, acceptor_node_id, donation_amount = \ self._gen_best_donation(children_shares, children_donatable_shares, children_receivable_shares) # if there is a donation to be made make it if donation_amount > 0: # account for this donation in the shares children_shares[donor_node_id] -= donation_amount children_shares[acceptor_node_id] += donation_amount # subtract the donation donatable_shares from the donor # and add the donation to the donatable_shares of the # acceptor children_donatable_shares[donor_node_id] -= donation_amount children_donatable_shares[acceptor_node_id] += donation_amount # do the opposite to the receivable shares children_receivable_shares[donor_node_id] += donation_amount children_receivable_shares[acceptor_node_id] -= donation_amount return children_shares def _gen_best_donation(self, children_shares, children_donatable_shares, children_receivable_shares): """Given a the children shares generate the best donation. Returns the donor_node_id the acceptor_node_id and the donation that should be done between them and that will be guaranteed to be valid. (this is done by checking the attributes of the regions node however, no changes to node state are performed) returns donor_node_id, acceptor_node_id, donation_amount """ # to find the best possible donation we would like to give # shares from the region with the most to the region with the # least and give as many as possible that will equalize them, # however the size of a donation is dependent not only on the # amount of shares each region has but also the number of # squashable walkers and the number of possible clones that # satisfy the maximum and minimum walker weight # constraints. These are given by the # children_donatable_shares and children_receivable_shares. We # use arguments instead of accessing the attributes of the # object because so this can be done in an iterative manner # before modifying the node attributes. 
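        # As an illustrative sketch (hypothetical numbers, not computed
        # from the object state): for two sibling regions x and y with
        # children_shares {x: 6, y: 2}, children_donatable_shares
        # {x: 3, y: 0} and children_receivable_shares {x: 0, y: 5}, the
        # ordered pair (x, y) has a share difference of 4, the desired
        # donation is floor(4/2) = 2, and since x can donate up to 3
        # and y can receive up to 5 the returned donation is (x, y, 2).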
# default values for the results best_pair = (None, None) best_donation_amount = 0 # if there are not enough children regions to acutally make # pairings between then we just return the default negative # result if len(children_shares) < 2: donor_node_id, acceptor_node_id = best_pair return donor_node_id, acceptor_node_id, best_donation_amount # we want to get the list of pairings where each pairing is # (donor, acceptor) pairings = [] for a, b in it.combinations(children_shares.keys(), 2): # find the largest difference comparing (a,b) and (b,a), # this will give the donor, acceptor pair permutations = [(a,b), (b,a)] perm_idx = np.argmax([children_shares[i] - children_shares[j] for i, j in permutations]) donor_acceptor_pair = permutations[perm_idx] pairings.append(donor_acceptor_pair) # to find the best match we first calculate the differences in # the number of shares for each possible pairing between # children shares pairings_differences = [children_shares[donor_id] - children_shares[acceptor_id] for donor_id, acceptor_id in pairings] pairings_donations = [] # then we find all the non-zero pairings for i, difference in enumerate(pairings_differences): # if there is a positive difference then we calculate what # the largest donation would be if difference > 0: donor_node_id, acceptor_node_id = pairings[i] # get the total numbers of shares for each donor_n_shares = children_shares[donor_node_id] acceptor_n_shares = children_shares[acceptor_node_id] # as well as the donatable and receivable shares donor_donatable_shares = children_donatable_shares[donor_node_id] acceptor_receivable_shares = children_receivable_shares[acceptor_node_id] # actually calculate the maximum donation donation_amount = self._calc_share_donation(donor_n_shares, acceptor_n_shares, donor_donatable_shares, acceptor_receivable_shares) pairings_donations.append(donation_amount) # if there is no difference then the donation amount will also be zero else: pairings_donations.append(0) # now we can zip them all together and then sort them such # that we first sort on the size of the number of shares and # then on the size of the donation pair_values = list(zip(pairings_differences, pairings_donations, pairings)) pair_values.sort() # largest to smallest pair_values.reverse() # now we take the pairing that has the highest difference and # has a nonzero donation size. Note there may be other # pairings with the same numbers that will just be ignored. pair_iter = iter(pair_values) # loop through until the first best donation is found. 
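        # For instance (hypothetical values): if pair_values sorts to
        # [(4, 0, (x, y)), (3, 2, (z, w)), ...] then (x, y) is skipped
        # because its donation is zero and (z, w), with a donation of
        # 2, is selected.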
# get the first pairing try: diff, donation, pairing = next(pair_iter) except StopIteration: raise RegionTreeError("No pairings to make donations between") done = False while not done: # since the pair_values are sorted first by the total diff # and then the donation size, the first one that has a # positive donation is the best donation if donation > 0: best_pair = pairing best_donation_amount = donation done = True # try to get the next pairing if there is one try: diff, donation, pairing = next(pair_iter) except StopIteration: # we are done, use the last pairing we had as the # pair, its just as good as any of the others let the # calling method decide what to do in this situation best_pair = pairing break # now we have the best donation and the pair is already in the # donor, acceptor order from when we created it donor_node_id, acceptor_node_id = best_pair return donor_node_id, acceptor_node_id, best_donation_amount def _find_best_donation_pair(self, children_donatable_shares, children_receivable_shares): """This method just returns which children have the most and least number of 'shares' which are the effective number of walker slots it will be granted in the next segment of dynamics in the simulation. This is essentially the amount of sampling effort that will be allocated to this region. This method is give the dictionary of the childrens """ # one region will be the donor of shares donor_child_node_id = None # the other will accept them acceptor_child_node_id = None # record for maximum number of donateable shares max_donatable_shares = None # records for the max and min number of shares of the acceptor # and donor regions donor_n_shares = None acceptor_n_shares = None # go through every walker and test it to see if it is either # the highest or lowest, record it if it is for child_node_id in children_donatable_shares.keys(): # the number of donatable shares is equal to the number # of squashable walkers n_donatable_shares = children_donatable_shares[child_node_id] # the number of possible shares this node can receive is # equal to the number of possible clones it can make n_receivable_shares = children_receivable_shares[child_node_id] # we see if this node region is the max region by testing # if it is the new highest in shares. 
It must also be able # to donate a share by having at least 1 squashable walker if ((donor_child_node_id is None) or (n_shares > donor_n_shares)) and \ (n_donatable_shares > 0): # this is a new record max_donatable_shares = n_donatable_shares # save how many shares this region has in total donor_n_shares = n_shares donor_child_node_id = child_node_id # test if this is the region with the lowest number of # shares that is still able to receive at least one share if ((acceptor_child_node_id is None) or (n_shares < acceptor_n_shares)) and \ (n_receivable_shares > 0): acceptor_n_shares = n_shares acceptor_child_node_id = child_node_id # check that both a donor and acceptor were identified and # that values for there shares were given assert all([True if val is not None else False for val in [donor_n_shares, acceptor_n_shares, donor_child_node_id, acceptor_child_node_id]]), \ "A donor or acceptor was not found" # if the acceptor's number of shares is not less then the # donor then there is not possible donation if acceptor_n_shares >= donor_n_shares: return False # if there is a net donation we return the donor and acceptor else: return donor_child_node_id, acceptor_child_node_id def _calc_share_donation(self, donor_n_shares, acceptor_n_shares, donor_donatable_shares, acceptor_receivable_shares): # the sibling with the greater number of shares (from both # previous resamplings and inherited from the parent) will # give shares to the sibling with the least. # To decide how many it shall give we first propose a desired # donation that will make them the most similar, rounding down # (i.e. midpoint) desired_donation = math.floor((donor_n_shares - acceptor_n_shares)/2) # however, the donor only has a certain capability of donation # and the acceptor has a certain capacity of receiving. Out of # the three we can only actually donate the smallest amount actual_donation = min(desired_donation, donor_donatable_shares, acceptor_receivable_shares) return actual_donation def _decide_merge_leaf(self, leaf, merge_groups): # this method assumes no cloning has been performed before this # TODO: potentially unneeded # all the walker idxs walker_idxs = list(range(len(merge_groups))) # the balance of this leaf leaf_balance = self.node[leaf]['balance'] # there should not be any taken walkers in this leaf since a # leaf should only have this method run for it once during # decision making, so the mergeable walkers are just all the # walkers in this leaf leaf_walker_idxs = self.node[leaf]['walker_idxs'] leaf_walker_weights = [self.walker_weights[walker_idx] for walker_idx in leaf_walker_idxs] # now that we have the walkers that may potentially be merged # we need to actually find a set of groupings that satisfies # the reduction in balance without any individual walker # exceeding the maximum weight. In general this is a difficult # problem both here and in deciding how balances are # distributed (because the potential merges determine a leafs # ability to pay a portion of a debt from a higher level in # the region tree). # currently we avoid this general problem (potentially of the # backpack kind if you want to help solve this issue) and # simply assume that we will perform a single merge of the # walkers of the lowest weights to achieve our balance # reduction goal. 
As long as this assumption holds in how the # balances are determined this will succeed, if not this will # fail # to allow for easy improvements later on pending this problem # becoming solved it is functionalized here to make a set of # pairings that satisfy the balance reduction goal, these are # "merge groups" except that at this point we haven't chosen # one to be the KEEP_MERGE walker and have its state # retained. This will be decided further on. So these "full # merge groups" include all the walkers that will be merged # and the sum of their weights will be the weight of the final # merged walker and should satisfy the maximum weight # requirement, i.e. it will not be checked here. full_merge_groups_leaf_walker_idxs = \ self._solve_merge_groupings(leaf_walker_weights, leaf_balance) # now we go through each of these "full merge groups" and make # the "merge groups". Pardon the terminology, but the # distinction is trivial and is only relevant to the # implementation. The "merge groups" are what is returned. To # make them we just choose which walker to keep and which # walkers to squash in each full merge group for full_merge_group_leaf_walker_idxs in full_merge_groups_leaf_walker_idxs: # the indices from this are in terms of the list of weights # given to the method so we translate them back to the actual # walker indices chosen_walker_idxs = [leaf_walker_idxs[leaf_walker_idx] for leaf_walker_idx in full_merge_group_leaf_walker_idxs] # get the weights of these chosen_walker_idxs chosen_weights = [self.walker_weights[walker_idx] for walker_idx in chosen_walker_idxs] # choose the one to keep the state of (e.g. KEEP_MERGE # in the Decision) based on their weights # normalize weights to the sum of all the chosen weights chosen_pdist = np.array(chosen_weights) / sum(chosen_weights) # then choose one of the the walker idxs to keep according to # their normalized weights keep_walker_idx = np.random.choice(chosen_walker_idxs, 1, p=chosen_pdist)[0] # pop the keep idx from the walkers so we can use the rest of # them as the squash idxs chosen_walker_idxs.pop(chosen_walker_idxs.index(keep_walker_idx)) # the rest are squash_idxs squash_walker_idxs = chosen_walker_idxs # update the merge group based on this decision merge_groups[keep_walker_idx].extend(squash_walker_idxs) return merge_groups def _solve_merge_groupings(self, walker_weights, balance): # this method chooses between the methods for solving the # backpack problem of how to merge walkers together to satisfy # a goal # as a method for easy transition between potential methods (I # expect there are multiple solutions to the problem with # different tradeoffs that will want to be tested) a method # can be chosen when the region tree is created and a constant # string identifier will be set indicating which method is in use # so we use that string to find which method to use if self.merge_method == 'single': full_merge_groups, result = decide_merge_groups_single_method( walker_weights, balance, self.pmax) else: raise ValueError("merge method {} not recognized".format(self.merge_method)) # if the result came out false then a solution could not be # found if not result: raise RegionTreeError( "A solution to the merging problem could not be found given the constraints") else: return full_merge_groups def _decide_clone_leaf(self, leaf, merge_groups, walkers_num_clones): # just follow the instructions in the walkers_num_clones and # find them slots # this assumes that the all squashes have already been # specified in the merge group, this is 
so that we can use # unused walker slots. # if this leaf node was assigned a debt we need to merge # walkers leaf_balance = self.node[leaf]['balance'] leaf_walker_idxs = self.node[leaf]['walker_idxs'] leaf_walker_weights = {walker_idx : self.walker_weights[walker_idx] for walker_idx in self.node[leaf]['walker_idxs']} # calculate the maximum possible number of clones each free walker # could produce walker_n_possible_clones = {walker_idx : self._calc_max_num_clones(leaf_weight) for walker_idx, leaf_weight in leaf_walker_weights.items()} # the sum of the possible clones needs to be greater than or # equal to the balance if not sum(walker_n_possible_clones.values()) >= leaf_balance: raise RegionTreeError("there isn't enough clones possible to pay the balance") # go through the list of free walkers and see which ones have # any possible clones and make a list of them cloneable_walker_idxs = [walker_idx for walker_idx in leaf_walker_idxs if walker_n_possible_clones[walker_idx] > 0] cloneable_walker_weights = [leaf_walker_weights[walker_idx] for walker_idx in cloneable_walker_idxs] # if the sum of them is equal to the leaf balance then we # don't have to optimally distribute them and we just give them out if sum(walker_n_possible_clones.values()) == leaf_balance: for walker_idx in cloneable_walker_idxs: walkers_num_clones[walker_idx] = walker_n_possible_clones[walker_idx] # return this without doing all the prioritization return walkers_num_clones # otherwise we want to optimally distribute the clones to the # walkers such that we split the largest walkers first # to distribute the clones we iteratively choose the walker # with the highest weight after a single clone # weight/(n_clones +1) where n_clones is the current number of # clones assigned to it (plus itself), and then add another # clone to it as long as it is within the range of the number # of clones it can make # go until the balance is paid off clones_left = leaf_balance while clones_left > 0: # determine which walkers are still in the running for # receiving clones still_cloneable_walker_idxs = [] still_cloneable_walker_weights = [] for walker_idx, weight in zip(cloneable_walker_idxs, cloneable_walker_weights): # if the number of clones is less than its maximum # possible add it to the still applicable ones if (walkers_num_clones[walker_idx] < walker_n_possible_clones[walker_idx]): still_cloneable_walker_idxs.append(walker_idx) still_cloneable_walker_weights.append(weight) # if there is only one applicable walker left give it the # rest of the balance and return if len(still_cloneable_walker_idxs) == 1: walkers_num_clones[still_cloneable_walker_idxs[0]] += clones_left clones_left -= clones_left # end this loop iteration, skipping the decision part continue # if there are multiple walkers left we decide between them # calculate the weights of the walker's children given the # current number of clones, if num_clones is 0 then it is # just it's own weight child_weights = [] for walker_idx, weight in zip(still_cloneable_walker_idxs, still_cloneable_walker_weights): # the weight of its children given the number of # clones already assigned to it child_weight = weight / (walkers_num_clones[walker_idx]+1) child_weights.append(child_weight) # get the walker_idx with the highest would-be child weight chosen_walker_idx = still_cloneable_walker_idxs[np.argsort(child_weights)[-1]] # add a clone to it walkers_num_clones[chosen_walker_idx] += 1 # we are one step closer to satisfying the cloning # requirement clones_left -= 1 return 
walkers_num_clones def _decide_settle_balance(self): """Given the balances of all the leaves figure out actually how to settle all the balances. Returns the merge_groups and walkers_num_clones """ # initialize the main data structures for specifying how to # merge and clone a set of walkers. These will be modified for # clones and merges but in the initialized state they will # generate all NOTHING records. # the merge groups, a list of lists where the elements of the # outer list are individual "merge groups" that themselves # contain the elements of the walkers that will be squashed in # a merge and the index of the merge group in the outer list # is the index of the walker that will be kept # (i.e. KEEP_MERGE and have its state persist to the next # step). Indices appearing in any merge group can themselves # not have a merge group merge_groups = [[] for i in self.walker_weights] # the number of clones to make for each walker. Simply a list # of 0 or positive integers that specify how many EXTRA clones # will be made. E.g. if a cloned walker is to have 3 children # then the number of clones is 2. We consider clones copies # from the original walker which is given by the index in the # list. This number then gives the number of new slots needed # for a cloning event walkers_num_clones = [0 for i in self.walker_weights] # get all the leaf balances leaf_nodes = self.leaf_nodes() leaf_balances = [self.node[leaf]['balance'] for leaf in leaf_nodes] # get the negative and positive balanced leaves neg_leaves = [leaf_nodes[leaf_idx[0]] for leaf_idx in np.argwhere(np.array(leaf_balances) < 0)] pos_leaves = [leaf_nodes[leaf_idx[0]] for leaf_idx in np.argwhere(np.array(leaf_balances) > 0)] # we decide on how the walkers will be cloned and # merged. These steps are purely functional and do not modify # any attributes on the RegionTree. The merge_groups and # walkers_num_clones can be used to commit these changes # elsewhere if desired. # first do all leaves with negative balances, so that after we # have freed up slots we can fill them with clones since in # WEXplore we want to have an economy of slots and not create # them if we don't have to for leaf in neg_leaves: merge_groups = self._decide_merge_leaf(leaf, merge_groups) # then do all the leaves with positive balances to fill the # slots left from squashing walkers for leaf in pos_leaves: walkers_num_clones = self._decide_clone_leaf(leaf, merge_groups, walkers_num_clones) return merge_groups, walkers_num_clones def _check_clone_merge_specs(self, merge_groups, walkers_num_clones): """This will perform the computations to get the weights of the clones and merges but does not actually assign them to slots. This is mainly for checking that we have not violated any rules. 
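
        As a hypothetical illustration of the expected inputs: for four
        walkers, merge_groups = [[1], [], [], []] together with
        walkers_num_clones = [0, 0, 1, 0] means that walker 1 is
        squashed into walker 0 (which keeps its state and absorbs
        walker 1's weight), walker 2 is split into two equal-weight
        children, and walker 3 is untouched, so the total number of
        walkers is conserved.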
""" # keep a dictionary of all the walkers that will be parents to # at least one child walker and make a list of the weights # what each of the children will be walker_children_weights = defaultdict(list) # walkers that will be keep merges keep_merge_walker_idxs = [] # walkers that will parents of clones clone_parent_walker_idxs = [] # walkers that do nothing and keep their state and weight nothing_walker_idxs = [] # if that passes then we can check whether or not the weights make sense new_walker_weights = [] # get all the squash idxs so we can pass over them all_squash_idxs = list(it.chain(*merge_groups)) # go through each walker and see what the results of it would # be without assigning it to anywhere in particular for walker_idx, num_clones in enumerate(walkers_num_clones): # check that clones are not performed on KEEP_MERGE and SQUASH # walkers if num_clones > 0: if len(merge_groups[walker_idx]) > 0: raise ResamplerError("trying to clone a KEEP_MERGE walker") squash_idxs = list(it.chain(merge_groups)) if walker_idx in squash_idxs: raise ResamplerError("trying to clone a SQUASH walker") squash_walker_idxs = merge_groups[walker_idx] # if it is a squashed walker ignore it if walker_idx in all_squash_idxs: pass # split the weight up evenly, the numbers in the list is # the extra number of walkers that should exist so that we # should add 1 to get the total number of child walkers # after the split elif num_clones > 0 and len(squash_walker_idxs) == 0: # add this to the list of clone parents clone_parent_walker_idxs.append(walker_idx) # get the weight each of the children will have clone_weights = self._walker_weights[walker_idx] / (num_clones + 1) # add them to the new_walker_weights and as the # children of this walkers weights for i in range(num_clones + 1): # weights of all walkers new_walker_weights.append(clone_weights) # weights of the children walker_children_weights[walker_idx].append(clone_weights) # if this is a merge group keep idx then we add the # weights of the merge group together elif len(squash_walker_idxs) > 0: # add this to the list of keep merge parents keep_merge_walker_idxs.append(walker_idx) # add up the weights of the squashed walkers squashed_weight = sum([self.walker_weights[i] for i in squash_walker_idxs]) # add them to the weight for the keep walker walker_weight = self._walker_weights[walker_idx] + squashed_weight new_walker_weights.append(walker_weight) walker_children_weights[walker_idx].append(walker_weight) # this is then a nothing instruction so we just add its # weight to the list as is else: nothing_walker_idxs.append(walker_idx) new_walker_weights.append(self._walker_weights[walker_idx]) walker_children_weights[walker_idx].append(self._walker_weights[walker_idx]) # check that we have the same number of walkers as when we started if not len(new_walker_weights) == len(self._walker_weights): raise ResamplerError("There is not the same number of walkers as before the clone-merges") # then we check that the total weight before and after is the # same or close to the same if not np.isclose(sum(self._walker_weights), sum(new_walker_weights)): raise ResamplerError("There has been a change in total amount of weight") # check that none of the walkers are outside the range of # probabilities new_walker_weights = np.array(new_walker_weights) overweight_walker_idxs = np.where(new_walker_weights > self.pmax)[0] # check that all of the weights are less than or equal to the pmax if len(overweight_walker_idxs > 0): # list of parents that produce overweight children 
overweight_producer_idxs = [] # figure out which parent created them, this will have # come from a merge so we just go through the parents that # are keep merges for keep_merge_walker_idx in keep_merge_walker_idxs: child_weight = walker_children_weights[keep_merge_walker_idx][0] if child_weight >= self.pmax: overweight_producer_idxs.append(keep_merge_walker_idx) raise ResamplerError( "Merge specs produce overweight walkers for merge groups {}".format( [str(i) for i in overweight_producer_idxs])) # check that all of the weights are less than or equal to the pmin underweight_walker_idxs = np.where(new_walker_weights < self.pmin)[0] if len(underweight_walker_idxs > 0): # list of clone parents that will produce underweight # walkers underweight_producer_idxs = [] # figure out which parents create underweight walkers, # only clones will do this so we just look through them for clone_parent_walker_idx in clone_parent_walker_idxs: # all children will be the same weight so we just get # one of the weights child_weight = walker_children_weights[clone_parent_walker_idx][0] if child_weight <= self.pmin: underweight_producer_idxs.append(clone_parent_walker_idx) raise ResamplerError( "Clone specs produce underweight walkers for clone walkers {}".format( [str(i) for i in underweight_producer_idxs])) def balance_tree(self, delta_walkers=0): """Do balancing between the branches of the tree. the `delta_walkers` kwarg can be used to increase or decrease the total number of walkers, but defaults to zero which will cause no net change in the number of walkers. """ # set the delta walkers to the balance of the root node self.node[self.ROOT_NODE]['balance'] = delta_walkers # do a breadth first traversal and balance at each level for parent, children in nx.bfs_successors(self, self.ROOT_NODE): # pass on the balance of this parent to the children from the # parents, distribute walkers between parental_balance = self.node[parent]['balance'] # this will both propagate the balance set for the root # walker down the tree and balance between the children self._propagate_and_balance_shares(parental_balance, children) # check that the sum of the balances of the leaf nodes # balances to delta_walkers leaf_balances = [self.node[leaf]['balance'] for leaf in self.leaf_nodes()] if sum(leaf_balances) != delta_walkers: raise RegionTreeError( "The balances of the leaf nodes ({}) do not balance to delta_walkers ({})".format( leaf_balances, delta_walkers)) # decide on how to settle all the balances between leaves merge_groups, walkers_num_clones = self._decide_settle_balance() # count up the number of clones and merges in the merge_groups # and the walkers_num_clones num_clones = sum(walkers_num_clones) num_squashed = sum([len(merge_group) for merge_group in merge_groups]) # check that the number of clones and number of squashed # walkers balance to the delta_walkers amount if num_clones - num_squashed != delta_walkers: raise RegionTreeError("The number of new clones ({}) is not balanced by the number of" "squashed walkers ({}) to the delta_walkers specified ({})".format( num_clones, num_squashed, delta_walkers)) # DEBUG # check the merge groups and walkers_num_clones to make sure # they are valid try: self._check_clone_merge_specs(merge_groups, walkers_num_clones) except ResamplerError as resampler_err: print(resampler_err) import ipdb; ipdb.set_trace() return merge_groups, walkers_num_clones class WExploreResampler(Resampler): DECISION = MultiCloneMergeDecision # datatype for the state change records of the resampler, here # 
that is the defnition of a new branch of the region tree, the # value is the level of the tree that is branched. Most of the # useful information will be in the auxiliary data, like the # image, distance the walker was away from the image at that # level, and the id of the leaf node RESAMPLER_FIELDS = ('branching_level', 'distance', 'new_leaf_id', 'image') RESAMPLER_SHAPES = ((1,), (1,), Ellipsis, Ellipsis) RESAMPLER_DTYPES = (np.int, np.float, np.int, None) # fields that can be used for a table like representation RESAMPLER_RECORD_FIELDS = ('branching_level', 'distance', 'new_leaf_id') # fields for resampling data RESAMPLING_FIELDS = DECISION.FIELDS + ('step_idx', 'walker_idx', 'region_assignment',) RESAMPLING_SHAPES = DECISION.SHAPES + ((1,), (1,), Ellipsis,) RESAMPLING_DTYPES = DECISION.DTYPES + (np.int, np.int, np.int,) # fields that can be used for a table like representation RESAMPLING_RECORD_FIELDS = DECISION.RECORD_FIELDS + \ ('step_idx', 'walker_idx', 'region_assignment',) def __init__(self, seed=None, pmin=1e-12, pmax=0.1, distance=None, max_n_regions=(10, 10, 10, 10), max_region_sizes=(1, 0.5, 0.35, 0.25), init_state=None, **kwargs ): # we call the common methods in the Resampler superclass. We # set the min and max number of walkers to be constant super().__init__(min_num_walkers=Ellipsis, max_num_walkers=Ellipsis, **kwargs) assert distance is not None, "Distance object must be given." assert init_state is not None, "An initial state must be given." self.decision = self.DECISION # the region tree which keeps track of the regions and can be # balanced for cloning and merging between them, is # initialized the first time resample is called because it # needs an initial walker self._region_tree = None # parameters self.pmin=pmin self.pmax=pmax self.seed = seed if self.seed is not None: rand.seed(self.seed) self.max_n_regions = max_n_regions self.n_levels = len(max_n_regions) self.max_region_sizes = max_region_sizes # in nanometers! 
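        # An illustrative construction (a sketch; `my_distance` and
        # `my_init_state` are hypothetical stand-ins for a distance
        # metric object and an initial walker state, neither is defined
        # in this module):
        #
        #   resampler = WExploreResampler(distance=my_distance,
        #                                 init_state=my_init_state,
        #                                 pmin=1e-12, pmax=0.1,
        #                                 max_n_regions=(10, 10, 10, 10),
        #                                 max_region_sizes=(1, 0.5, 0.35, 0.25))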
# distance metric self.distance = distance # we do not know the shape and dtype of the images until # runtime so we determine them here image = self.distance.image(init_state) self.image_shape = image.shape self.image_dtype = image.dtype # initialize the region tree with the first state self._region_tree = RegionTree(init_state, max_n_regions=self.max_n_regions, max_region_sizes=self.max_region_sizes, distance=self.distance, pmin=self.pmin, pmax=self.pmax) def resampler_field_shapes(self): # index of the image idx image_idx = self.resampler_field_names().index('image') # shapes adding the image shape shapes = list(super().resampler_field_shapes()) shapes[image_idx] = self.image_shape return tuple(shapes) def resampler_field_dtypes(self): # index of the image idx image_idx = self.resampler_field_names().index('image') # dtypes adding the image dtype dtypes = list(super().resampler_field_dtypes()) dtypes[image_idx] = self.image_dtype return tuple(dtypes) # override the superclass methods to utilize the decision class def resampling_field_names(self): return self.RESAMPLING_FIELDS def resampling_field_shapes(self): return self.RESAMPLING_SHAPES def resampling_field_dtypes(self): return self.RESAMPLING_DTYPES def resampling_fields(self): return list(zip(self.resampling_field_names(), self.resampling_field_shapes(), self.resampling_field_dtypes())) @property def region_tree(self): return self._region_tree def assign(self, walkers): ## Assign the walkers based on the current defined Voronoi ## images which assign them to bins/leaf-nodes, possibly ## creating new regions, do this by calling the method to ## "place_walkers" on the tree which changes the tree's state new_branches = self.region_tree.place_walkers(walkers) # data records about changes to the resampler, here is just # the new branches data resampler_data = new_branches # the assignments assignments = np.array(self.region_tree.walker_assignments) # return the assignments and the resampler records of changed # resampler state, which is addition of new regions return assignments, resampler_data def decide(self, delta_walkers=0): """ Make decisions for resampling for a single step. """ ## Given the assignments (which are on the tree nodes) decide ## on which to merge and clone # do this by "balancing" the tree. 
delta_walkers can be # specified to increase or decrease the total number of # walkers merge_groups, walkers_num_clones = \ self.region_tree.balance_tree(delta_walkers=delta_walkers) logging.info("merge_groups\n{}".format(merge_groups)) logging.info("Walker number of clones\n{}".format(walkers_num_clones)) logging.info("Walker assignments\n{}".format(self.region_tree.walker_assignments)) logging.info("Walker weights\n{}".format(self.region_tree.walker_weights)) # check to make sure we have selected appropriate walkers to clone logging.info("images_assignments\n{}".format(self.region_tree.regions)) # take the specs for cloning and merging and generate the # actual resampling actions (instructions) for each walker, # this does not change the state of the resampler or region # tree resampling_actions = self.assign_clones(merge_groups, walkers_num_clones) if self.is_debug_on: # check that the actions were performed correctly try: self._check_resampling_data(resampling_actions) except ResamplerError as resampler_err: print(resampler_err) import ipdb; ipdb.set_trace() # add the walker_idx to the records to be returned for walker_idx, walker_record in enumerate(resampling_actions): walker_record['walker_idx'] = walker_idx return resampling_actions @staticmethod def _check_resampling_data(resampling_data): # in WExplore we don't increase or decrease the number of # walkers and thus all slots must be filled so we go through # each decision that targets slots in the next stop and # collect all of those n_slots = len(resampling_data) taken_slot_idxs = [] squash_slot_idxs = [] keep_merge_slot_idxs = [] for rec_d in resampling_data: if rec_d['decision_id'] in (1, 2, 4): taken_slot_idxs.extend(rec_d['target_idxs']) if rec_d['decision_id'] == 3: squash_slot_idxs.extend(rec_d['target_idxs']) if rec_d['decision_id'] == 4: keep_merge_slot_idxs.extend(rec_d['target_idxs']) # see if there are any repeated targets if len(set(taken_slot_idxs)) < len(taken_slot_idxs): raise ResamplerError("repeated slots to be used") # check that the number of targets is exactly the number of slots available if len(taken_slot_idxs) < n_slots: raise ResamplerError("Number of slots used is less than the number of slots") elif len(taken_slot_idxs) > n_slots: raise ResamplerError("Number of slots used is greater than the number of slots") # check that all squashes are going to a merge slot if not all([False if squash_slot_idx not in keep_merge_slot_idxs else True for squash_slot_idx in set(squash_slot_idxs)]): raise ResamplerError("Not all squashes are assigned to keep_merge slots") def _resample_init(self, walkers): super()._resample_init(walkers) # then get the walker nums using our methods to get it for # this resampling and just give that to the region tree self.region_tree.max_num_walkers = self.max_num_walkers() self.region_tree.min_num_walkers = self.min_num_walkers() if self.is_debug_on: # cache a copy of the region_tree in its state before putting # these walkers through it so we can replay steps if necessary self._cached_region_tree = deepcopy(self._region_tree) # and keep the walkers too self._input_walkers = deepcopy(walkers) def _resample_cleanup(self, resampling_data, resampler_data, resampled_walkers): if self.is_debug_on: # check that the weights of the resampled walkers are not # beyond the bounds of what they are supposed to be try: self._check_resampled_walkers(resampled_walkers) except ResamplerError as resampler_err: print(resampler_err) import ipdb; ipdb.set_trace() # keep the tree we just used curr_region_tree 
= self._region_tree # replace the region tree with the cached region_tree self._region_tree = self._cached_region_tree # then run resample again with the original walkers self.resample(self._input_walkers) # then reset the old region tree self._region_tree = curr_region_tree # and clean out the debug variables del self._cached_region_tree del self._input_walkers # clear the tree of walker information for the next resampling self.region_tree.clear_walkers() # just use the superclass method super()._resample_cleanup() # then get the walker nums using our methods to get it for # this resampling and just give that to the region tree self.region_tree.max_num_walkers = False self.region_tree.min_num_walkers = False def resample(self, walkers): # do some initialiation routines and debugging preparations if # necessary self._resample_init(walkers) ## assign/score the walkers, also getting changes in the ## resampler state assignments, resampler_data = self.assign(walkers) # make the decisions for the the walkers for only a single # step resampling_data = self.decide(delta_walkers=0) # perform the cloning and merging, the action function expects # records a lists of lists for steps and walkers resampled_walkers = self.DECISION.action(walkers, [resampling_data]) # normally decide is only for a single step and so does not # include the step_idx, so we add this to the records for walker_idx, walker_record in enumerate(resampling_data): walker_record['step_idx'] = np.array([0]) # convert the target idxs and decision_id to feature vector arrays for record in resampling_data: record['target_idxs'] = np.array(record['target_idxs']) record['decision_id'] = np.array([record['decision_id']]) record['walker_idx'] = np.array([record['walker_idx']]) # then add the assignments and distance to image for each walker for walker_idx, assignment in enumerate(assignments): resampling_data[walker_idx]['region_assignment'] = assignment self._resample_cleanup(resampling_data, resampler_data, resampled_walkers) return resampled_walkers, resampling_data, resampler_data def _check_resampled_walkers(self, resampled_walkers): walker_weights = np.array([walker.weight for walker in resampled_walkers]) # check that all of the weights are less than or equal to the pmax overweight_walker_idxs = np.where(walker_weights > self.pmax)[0] if len(overweight_walker_idxs) > 0: raise ResamplerError("All walker weights must be less than the pmax, " "walkers {} are all overweight".format( ','.join([str(i) for i in overweight_walker_idxs]))) # check that all walkers are greater than or equal to the pmin underweight_walker_idxs = np.where(walker_weights < self.pmin)[0] if len(underweight_walker_idxs) > 0: raise ResamplerError("All walker weights must be greater than the pmin, " "walkers {} are all underweight".format( ','.join([str(i) for i in underweight_walker_idxs]))) PK!wepy/runners/__init__.pyPK!  
:@:@wepy/runners/openmm.pyfrom copy import copy import random as rand from warnings import warn import logging import numpy as np import simtk.openmm.app as omma import simtk.openmm as omm import simtk.unit as unit from wepy.walker import Walker, WalkerState from wepy.runners.runner import Runner from wepy.work_mapper.worker import Worker from wepy.reporter.reporter import Reporter ## Constants KEYS = ('positions', 'velocities', 'forces', 'kinetic_energy', 'potential_energy', 'time', 'box_vectors', 'box_volume', 'parameters', 'parameter_derivatives') # when we use the get_state function from the simulation context we # can pass options for what kind of data to get, this is the default # to get all the data. TODO not really sure what the 'groups' keyword # is for though GET_STATE_KWARG_DEFAULTS = (('getPositions', True), ('getVelocities', True), ('getForces', True), ('getEnergy', True), ('getParameters', True), ('getParameterDerivatives', True), ('enforcePeriodicBox', True),) # TODO unsure of how to use this kwarg #('groups') ) # the Units objects that OpenMM uses internally and are returned from # simulation data UNITS = (('positions_unit', unit.nanometer), ('time_unit', unit.picosecond), ('box_vectors_unit', unit.nanometer), ('velocities_unit', unit.nanometer/unit.picosecond), ('forces_unit', unit.kilojoule / (unit.nanometer * unit.mole)), ('box_volume_unit', unit.nanometer), ('kinetic_energy_unit', unit.kilojoule / unit.mole), ('potential_energy_unit', unit.kilojoule / unit.mole), ) # the names of the units from the units objects above. This is used # for saving them to files UNIT_NAMES = (('positions_unit', unit.nanometer.get_name()), ('time_unit', unit.picosecond.get_name()), ('box_vectors_unit', unit.nanometer.get_name()), ('velocities_unit', (unit.nanometer/unit.picosecond).get_name()), ('forces_unit', (unit.kilojoule / (unit.nanometer * unit.mole)).get_name()), ('box_volume_unit', unit.nanometer.get_name()), ('kinetic_energy_unit', (unit.kilojoule / unit.mole).get_name()), ('potential_energy_unit', (unit.kilojoule / unit.mole).get_name()), ) # a random seed will be chosen from 1 to RAND_SEED_RANGE_MAX when the # Langevin integrator is created. 0 is the default and special value # which will then choose a random value when the integrator is created RAND_SEED_RANGE_MAX = 1000000 # the runner for the simulation which runs the actual dynamics class OpenMMRunner(Runner): def __init__(self, system, topology, integrator, platform=None): # we save the different components. 
However, if we are to make # this runner picklable we have to convert the SWIG objects to # a picklable form self.system = system self.integrator = integrator # these are not SWIG objects self.topology = topology self.platform_name = platform def _openmm_swig_objects(self): """Just returns all of the foreign OpenMM module objects this class uses that are actually SWIG wrappers.""" return (self.system, self.integrator) def run_segment(self, walker, segment_length, getState_kwargs=None, **kwargs): # set the kwargs that will be passed to getState tmp_getState_kwargs = getState_kwargs getState_kwargs = dict(GET_STATE_KWARG_DEFAULTS) if tmp_getState_kwargs is not None: getState_kwargs.update(tmp_getState_kwargs) # make a copy of the integrator for this particular segment new_integrator = copy(self.integrator) # force setting of random seed to 0, which is a special # value that forces the integrator to choose another # random number new_integrator.setRandomNumberSeed(0) # if a platform was given we use it to make a Simulation object if self.platform_name is not None: # get the platform by its name to use platform = omm.Platform.getPlatformByName(self.platform_name) # set properties from the kwargs if they apply to the platform for key, value in kwargs.items(): if key in platform.getPropertyNames(): platform.setPropertyDefaultValue(key, value) # make a new simulation object simulation = omma.Simulation(self.topology, self.system, new_integrator, platform) # otherwise just use the default or environmentally defined one else: simulation = omma.Simulation(self.topology, self.system, new_integrator) # set the state to the context from the walker simulation.context.setState(walker.state.sim_state) # Run the simulation segment for the number of time steps simulation.step(segment_length) # save the state of the system with all possible values new_sim_state = simulation.context.getState(**getState_kwargs) # make an OpenMMState wrapper with this new_state = OpenMMState(new_sim_state) # create a new walker for this new_walker = OpenMMWalker(new_state, walker.weight) return new_walker class OpenMMState(WalkerState): KEYS = KEYS OTHER_KEY_TEMPLATE = "{}_OTHER" def __init__(self, sim_state, **kwargs): # save the simulation state self._sim_state = sim_state # save additional data if given self._data = {} for key, value in kwargs.items(): # if the key is already in the sim_state keys we need to # modify it and raise a warning if key in self.KEYS: warn("Key {} in kwargs is already taken by this class, renaming to {}".format( self.OTHER_KEY_TEMPLATE).format(key)) # make a new key new_key = self.OTHER_KEY_TEMPLATE.format(key) # set it in the data self._data[new_key] = value # otherwise just set it else: self._data[key] = value @property def sim_state(self): return self._sim_state def __getitem__(self, key): # if this was a key for data not mapped from the OpenMM.State # object we use the _data attribute if key not in self.KEYS: return self._data[key] # otherwise we have to specifically get the correct data and # process it into an array from the OpenMM.State else: if key == 'positions': return self.positions_values() elif key == 'velocities': return self.velocities_values() elif key == 'forces': return self.forces_values() elif key == 'kinetic_energy': return self.kinetic_energy_value() elif key == 'potential_energy': return self.potential_energy_value() elif key == 'time': return self.time_value() elif key == 'box_vectors': return self.box_vectors_values() elif key == 'box_volume': return self.box_volume_value() 
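            # For example (illustrative; assumes the wrapped State was
            # created with the default GET_STATE_KWARG_DEFAULTS above):
            #   state = OpenMMState(sim_state)
            #   state['positions']  # unitless numpy array, shape (n_atoms, 3)
            #   state['time']       # numpy array of shape (1,)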
elif key == 'parameters': return self.parameters_values() elif key == 'parameter_derivatives': return self.parameter_derivatives_values() ## Array properties # Positions @property def positions(self): try: return self.sim_state.getPositions(asNumpy=True) except: warn("Unknown exception handled from `self.sim_state.getPositions()`, " "this is probably because this attribute is not in the State.") return None @property def positions_unit(self): return self.positions.unit def positions_values(self): return self.positions.value_in_unit(self.positions_unit) # Velocities @property def velocities(self): try: return self.sim_state.getVelocities(asNumpy=True) except: warn("Unknown exception handled from `self.sim_state.getVelocities()`, " "this is probably because this attribute is not in the State.") return None @property def velocities_unit(self): return self.velocities.unit def velocities_values(self): velocities = self.velocities if velocities is None: return None else: return self.velocities.value_in_unit(self.velocities_unit) # Forces @property def forces(self): try: return self.sim_state.getForces(asNumpy=True) except: warn("Unknown exception handled from `self.sim_state.getForces()`, " "this is probably because this attribute is not in the State.") return None @property def forces_unit(self): return self.forces.unit def forces_values(self): forces = self.forces if forces is None: return None else: return self.forces.value_in_unit(self.forces_unit) # Box Vectors @property def box_vectors(self): try: return self.sim_state.getPeriodicBoxVectors(asNumpy=True) except: warn("Unknown exception handled from `self.sim_state.getPeriodicBoxVectors()`, " "this is probably because this attribute is not in the State.") return None @property def box_vectors_unit(self): return self.box_vectors.unit def box_vectors_values(self): box_vectors = self.box_vectors if box_vectors is None: return None else: return self.box_vectors.value_in_unit(self.box_vectors_unit) ## non-array properties # Kinetic Energy @property def kinetic_energy(self): try: return self.sim_state.getKineticEnergy() except: warn("Unknown exception handled from `self.sim_state.getKineticEnergy()`, " "this is probably because this attribute is not in the State.") return None @property def kinetic_energy_unit(self): return self.kinetic_energy.unit def kinetic_energy_value(self): kinetic_energy = self.kinetic_energy if kinetic_energy is None: return None else: return np.array([self.kinetic_energy.value_in_unit(self.kinetic_energy_unit)]) # Potential Energy @property def potential_energy(self): try: return self.sim_state.getPotentialEnergy() except: warn("Unknown exception handled from `self.sim_state.getPotentialEnergy()`, " "this is probably because this attribute is not in the State.") return None @property def potential_energy_unit(self): return self.potential_energy.unit def potential_energy_value(self): potential_energy = self.potential_energy if potential_energy is None: return None else: return np.array([self.potential_energy.value_in_unit(self.potential_energy_unit)]) # Time @property def time(self): try: return self.sim_state.getTime() except: warn("Unknown exception handled from `self.sim_state.getTime()`, " "this is probably because this attribute is not in the State.") return None @property def time_unit(self): return self.time.unit def time_value(self): time = self.time if time is None: return None else: return np.array([self.time.value_in_unit(self.time_unit)]) # Box Volume @property def box_volume(self): try: return 
self.sim_state.getPeriodicBoxVolume() except: warn("Unknown exception handled from `self.sim_state.getPeriodicBoxVolume()`, " "this is probably because this attribute is not in the State.") return None @property def box_volume_unit(self): return self.box_volume.unit def box_volume_value(self): box_volume = self.box_volume if box_volume is None: return None else: return np.array([self.box_volume.value_in_unit(self.box_volume_unit)]) ## Dictionary properties ## Unitless # Parameters @property def parameters(self): try: return self.sim_state.getParameters() except: warn("Unknown exception handled from `self.sim_state.getParameters()`, " "this is probably because this attribute is not in the State.") return None @property def parameters_unit(self): param_units = {key : None for key, val in self.parameters.items()} return param_units def parameters_values(self): if self.parameters is None: return None param_arrs = {key : np.array(val) for key, val in self.parameters.items()} # return None if there is nothing in this if len(param_arrs) == 0: return None else: return param_arrs # Parameter Derivatives @property def parameter_derivatives(self): try: return self.sim_state.getEnergyParameterDerivatives() except: warn("Unknown exception handled from `self.sim_state.getEnergyParameterDerivatives()`, " "this is probably because this attribute is not in the State.") return None @property def parameter_derivatives_unit(self): param_units = {key : None for key, val in self.parameter_derivatives.items()} return param_units def parameter_derivatives_values(self): if self.parameter_derivatives is None: return None param_arrs = {key : np.array(val) for key, val in self.parameter_derivatives.items()} # return None if there is nothing in this if len(param_arrs) == 0: return None else: return param_arrs def omm_state_dict(self): """return a dict of the values for the keys that are hardcoded in this class.""" return {'positions' : self.positions_values(), 'velocities' : self.velocities_values(), 'forces' : self.forces_values(), 'kinetic_energy' : self.kinetic_energy_value(), 'potential_energy' : self.potential_energy_value(), 'time' : self.time_value(), 'box_vectors' : self.box_vectors_values(), 'box_volume' : self.box_volume_value(), 'parameters' : self.parameters_values(), 'parameter_derivatives' : self.parameter_derivatives_values() } def dict(self): """Return a dict of the values for all attributes of this state.""" d = {} for key, value in self._data.items(): d[key] = value for key, value in self.omm_state_dict().items(): d[key] = value return d def to_mdtraj(self): """ Returns an mdtraj.Trajectory object from this walker's state.""" raise NotImplementedError import mdtraj as mdj # resize the time to a 1D vector return mdj.Trajectory(self.positions_values, time=self.time_value[:,0], unitcell_vectors=self.box_vectors_values) class OpenMMWalker(Walker): def __init__(self, state, weight): assert isinstance(state, OpenMMState), \ "state must be an instance of class OpenMMState not {}".format(type(state)) super().__init__(state, weight) class OpenMMGPUWorker(Worker): def run_task(self, task): # run the task and pass in the DeviceIndex for OpenMM to # assign work to the correct GPU return task(DeviceIndex=str(self.worker_idx)) PK!< wepy/runners/randomwalk.pyimport random as rand import logging import numpy as np from simtk import unit from wepy.runners.runner import Runner from wepy.walker import Walker, WalkerState UNIT_NAMES = (('positions_unit', unit.nanometer.get_name()), ('time_unit', 
                                   unit.picosecond.get_name()),
              )


class RandomWalkRunner(Runner):
    """RandomWalkRunner is an object for implementing the dynamics of a
    RandomWalk system. To use it, you need to provide the number of
    dimensions and the probability of movement.
    """

    def __init__(self, dimension=2, probability=0.25):
        """Initialize a RandomWalk object with the number of dimensions and
        the movement probability.

        :param dimension: integer
        :param probability: float
        """
        self.dimension = dimension
        self.probability = probability

    def walk(self, positions):
        """Implement the dynamics of the RandomWalk system for one step.
        Takes the current position vector as input and, based on the
        probability, generates a new position for each dimension and
        returns the new position vector.

        :param positions: a numpy array of shape (1, dimension)
        :returns: new position
        :rtype: a numpy array of shape (1, dimension)
        """
        # make a copy of the current position
        new_positions = positions.copy()

        # iterate over each dimension
        for dimension in range(self.dimension):

            # generate a uniform random number to choose between
            # increasing or decreasing the position
            r = rand.uniform(0, 1)

            # make a forward movement
            if r < self.probability:
                new_positions[0][dimension] += 1
            # make a backward movement
            else:
                new_positions[0][dimension] -= 1

            # implement the boundary condition for movement, movements
            # to -1 are rejected
            if new_positions[0][dimension] < 0:
                new_positions[0][dimension] = 0

        return new_positions

    def run_segment(self, walker, segment_length):
        """Run dynamics of the RandomWalk system for the number of steps
        specified by segment_length.

        :param walker: a walker object
        :param segment_length: the number of steps
        :returns: a walker with new positions
        :rtype: Walker
        """
        # get the current position of the RandomWalk walker
        positions = walker.state['positions']

        # make movements for segment_length steps
        for segment_idx in range(segment_length):
            # call the walk function for a one-step movement
            new_positions = self.walk(positions)
            positions = new_positions

        # make a new state from the new positions
        new_state = WalkerState(positions=new_positions, time=0.0)

        # create a new walker from the new state and the current weight
        new_walker = Walker(new_state, walker.weight)

        return new_walker
PK!ė??wepy/runners/runner.py
import logging


class Runner(object):

    def run_segment(self, init_walkers, segment_length):
        raise NotImplementedError


class NoRunner(Runner):
    """Stub class that just returns the walkers back with the same state."""

    def run_segment(self, init_walkers, segment_length):
        return init_walkers
PK!Z%Z%wepy/sim_manager.py
import sys
import time
from copy import deepcopy
import logging

from wepy.work_mapper.mapper import Mapper


class Manager(object):

    def __init__(self, init_walkers,
                 runner=None,
                 resampler=None,
                 boundary_conditions=None,
                 reporters=None,
                 work_mapper=None
                 ):

        self.init_walkers = init_walkers
        self.n_init_walkers = len(init_walkers)

        # the runner is the object that runs dynamics
        self.runner = runner
        # the resampler
        self.resampler = resampler
        # object for boundary conditions
        self.boundary_conditions = boundary_conditions

        # the method for writing output
        if reporters is None:
            self.reporters = []
        else:
            self.reporters = reporters

        self.work_mapper = work_mapper

    def run_segment(self, walkers, segment_length):
        """Run a time segment for all walkers using the available workers.
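
        The segment length is broadcast to every walker, so this is
        roughly equivalent to the sketch below (illustrative only; the
        actual calls are dispatched through self.work_mapper, which may
        distribute them across workers):

            [self.runner.run_segment(walker, segment_length)
             for walker in walkers]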
""" num_walkers = len(walkers) logging.info("Starting segment") new_walkers = list(self.work_mapper.map(walkers, (segment_length for i in range(num_walkers)), ) ) logging.info("Ending segment") return new_walkers def run_cycle(self, walkers, segment_length, cycle_idx): logging.info("Begin cycle {}".format(cycle_idx)) # run the segment start = time.time() new_walkers = self.run_segment(walkers, segment_length) end = time.time() runner_time = end - start logging.info("End cycle {}".format(cycle_idx)) # boundary conditions should be optional; # initialize the warped walkers to the new_walkers and # change them later if need be warped_walkers = new_walkers warp_data = [] bc_data = [] progress_data = [] bc_time = 0.0 if self.boundary_conditions is not None: # apply rules of boundary conditions and warp walkers through space start = time.time() bc_results = self.boundary_conditions.warp_walkers(new_walkers, cycle_idx) end = time.time() bc_time = end - start # warping results warped_walkers = bc_results[0] warp_data = bc_results[1] bc_data = bc_results[2] progress_data = bc_results[3] if len(warp_data) > 0: logging.info("Returned warp record in cycle {}".format(cycle_idx)) # resample walkers start = time.time() resampling_results = self.resampler.resample(warped_walkers) end = time.time() resampling_time = end - start resampled_walkers = resampling_results[0] resampling_data = resampling_results[1] resampler_data = resampling_results[2] # log the weights of the walkers after resampling result_template_str = "|".join(["{:^5}" for i in range(self.n_init_walkers + 1)]) walker_weight_str = result_template_str.format("weight", *[round(walker.weight, 3) for walker in resampled_walkers]) logging.info(walker_weight_str) # report results to the reporters for reporter in self.reporters: reporter.report(cycle_idx, new_walkers, warp_data, bc_data, progress_data, resampling_data, resampler_data, n_steps=segment_length, worker_segment_times=self.work_mapper.worker_segment_times, cycle_runner_time=runner_time, cycle_bc_time=bc_time, cycle_resampling_time=resampling_time, resampled_walkers=resampled_walkers) # prepare resampled walkers for running new state changes walkers = resampled_walkers # we also return a list of the "filters" which are the # classes that are run on the initial walkers to produce # the final walkers. THis is to satisfy a future looking # interface in which the order and components of these # filters are completely parametrizable. This may or may # not be implemented in a future release of wepy but this # interface is assumed by the orchestration classes for # making snapshots of the simulations. The receiver of # these should perform the copy to make sure they aren't # mutated. We don't do this here for efficiency. filters = [self.runner, self.boundary_conditions, self.resampler] return walkers, filters def init(self, num_workers=None, continue_run=None): logging.info("Starting simulation") # initialize the work_mapper with the function it will be # mapping and the number of workers, this may include things like starting processes # etc. 
self.work_mapper.init(segment_func=self.runner.run_segment, num_workers=num_workers) # init the reporter for reporter in self.reporters: reporter.init(init_walkers=self.init_walkers, runner=self.runner, resampler=self.resampler, boundary_conditions=self.boundary_conditions, work_mapper=self.work_mapper, reporters=self.reporters, continue_run=continue_run) def cleanup(self): # cleanup the mapper self.work_mapper.cleanup() # cleanup things associated with the reporter for reporter in self.reporters: reporter.cleanup(runner=self.runner, work_mapper=self.work_mapper, resampler=self.resampler, boundary_conditions=self.boundary_conditions, reporters=self.reporters) def run_simulation_by_time(self, run_time, segments_length, num_workers=None): """Run a simulation for a certain amount of time. This starts timing as soon as this is called. If the time before running a new cycle is greater than the runtime the run will exit after cleaning up. Once a cycle is started it may also run over the wall time. run_time :: float (in seconds) segments_length :: int ; number of iterations performed for each walker segment for each cycle """ start_time = time.time() self.init(num_workers=num_workers) cycle_idx = 0 walkers = self.init_walkers while time.time() - start_time < run_time: logging.info("starting cycle {} at time {}".format(cycle_idx, time.time() - start_time)) walkers, filters = self.run_cycle(walkers, segments_length, cycle_idx) logging.info("ending cycle {} at time {}".format(cycle_idx, time.time() - start_time)) cycle_idx += 1 self.cleanup() return walkers, deepcopy(filters) def run_simulation(self, n_cycles, segment_lengths, num_workers=None): """Run a simulation for a given number of cycles with specified lengths of MD segments in between. """ self.init(num_workers=num_workers) walkers = self.init_walkers # the main cycle loop for cycle_idx in range(n_cycles): walkers, filters = self.run_cycle(walkers, segment_lengths[cycle_idx], cycle_idx) self.cleanup() return walkers, deepcopy(filters) def continue_run_simulation(self, run_idx, n_cycles, segment_lengths, num_workers=None): """Continue a simulation. All this does is provide a run idx to the reporters, which is the run that is intended to be continued. This simulation manager knows no details and is left up to the reporters to handle this appropriately. """ self.init(num_workers=num_workers, continue_run=run_idx) walkers = self.init_walkers # the main cycle loop for cycle_idx in range(n_cycles): walkers, filters = self.run_cycle(walkers, segment_lengths[cycle_idx], cycle_idx) self.cleanup() return walkers, filters def continue_run_simulation_by_time(self, run_idx, run_time, segments_length, num_workers=None): """Continue a simulation. All this does is provide a run idx to the reporters, which is the run that is intended to be continued. This simulation manager knows no details and is left up to the reporters to handle this appropriately. 
""" start_time = time.time() self.init(num_workers=num_workers, continue_run=run_idx) cycle_idx = 0 walkers = self.init_walkers while time.time() - start_time < run_time: logging.info("starting cycle {} at time {}".format(cycle_idx, time.time() - start_time)) walkers, filters = self.run_cycle(walkers, segments_length, cycle_idx) logging.info("ending cycle {} at time {}".format(cycle_idx, time.time() - start_time)) cycle_idx += 1 self.cleanup() return walkers, filters PK!wepy/util/__init__.pyPK!vo/wepy/util/mdtraj.pyimport json from warnings import warn import operator import numpy as np import mdtraj as mdj import mdtraj.core.element as elem # the following method contains portions of the software mdtraj which # is distributed under the following license ############################################################################## # MDTraj: A Python Library for Loading, Saving, and Manipulating # Molecular Dynamics Trajectories. # Copyright 2012-2014 Stanford University and the Authors # # Authors: Peter Eastman, Robert McGibbon # Contributors: Kyle A. Beauchamp, Matthew Harrigan, Carlos Xavier Hernandez # # MDTraj is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as # published by the Free Software Foundation, either version 2.1 # of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with MDTraj. If not, see . # # Portions of this code originate from the OpenMM molecular simulation # toolkit, copyright (c) 2012 Stanford University and Peter Eastman. Those # portions are distributed under the following terms: # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), # to deal in the Software without restriction, including without limitation # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, and to permit persons to whom the # Software is furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL # THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE # USE OR OTHER DEALINGS IN THE SOFTWARE. ############################################################################## def mdtraj_to_json_topology(mdj_top): """ Copied in part from MDTraj.formats.hdf5.topology setter. 
""" topology_dict = { 'chains': [], 'bonds': [] } for chain in mdj_top.chains: chain_dict = { 'residues': [], 'index': int(chain.index) } for residue in chain.residues: residue_dict = { 'index': int(residue.index), 'name': str(residue.name), 'atoms': [], "resSeq": int(residue.resSeq) } for atom in residue.atoms: try: element_symbol_string = str(atom.element.symbol) except AttributeError: element_symbol_string = "" residue_dict['atoms'].append({ 'index': int(atom.index), 'name': str(atom.name), 'element': element_symbol_string }) chain_dict['residues'].append(residue_dict) topology_dict['chains'].append(chain_dict) for atom1, atom2 in mdj_top.bonds: topology_dict['bonds'].append([ int(atom1.index), int(atom2.index) ]) top_json_str = json.dumps(topology_dict) return top_json_str def json_to_mdtraj_topology(json_string): """ Copied in part from MDTraj.formats.hdf5 topology property.""" topology_dict = json.loads(json_string) topology = mdj.Topology() for chain_dict in sorted(topology_dict['chains'], key=operator.itemgetter('index')): chain = topology.add_chain() for residue_dict in sorted(chain_dict['residues'], key=operator.itemgetter('index')): try: resSeq = residue_dict["resSeq"] except KeyError: resSeq = None warn('No resSeq information found in HDF file, defaulting to zero-based indices') try: segment_id = residue_dict["segmentID"] except KeyError: segment_id = "" residue = topology.add_residue(residue_dict['name'], chain, resSeq=resSeq, segment_id=segment_id) for atom_dict in sorted(residue_dict['atoms'], key=operator.itemgetter('index')): try: element = elem.get_by_symbol(atom_dict['element']) except KeyError: element = elem.virtual topology.add_atom(atom_dict['name'], element, residue) atoms = list(topology.atoms) for index1, index2 in topology_dict['bonds']: topology.add_bond(atoms[index1], atoms[index2]) return topology PK!޸!wepy/util/util.pyimport json import numpy as np def traj_box_vectors_to_lengths_angles(traj_box_vectors): """Convert box vectors for multiple 'frames' (a 'trajectory') to box lengths and angles.""" traj_unitcell_lengths = [] for basis in traj_box_vectors: traj_unitcell_lengths.append(np.array([np.linalg.norm(frame_v) for frame_v in basis])) traj_unitcell_lengths = np.array(traj_unitcell_lengths) traj_unitcell_angles = [] for vs in traj_box_vectors: angles = np.array([np.degrees( np.arccos(np.dot(vs[i], vs[j])/ (np.linalg.norm(vs[i]) * np.linalg.norm(vs[j])))) for i, j in [(0,1), (1,2), (2,0)]]) traj_unitcell_angles.append(angles) traj_unitcell_angles = np.array(traj_unitcell_angles) return traj_unitcell_lengths, traj_unitcell_angles def box_vectors_to_lengths_angles(box_vectors): """Convert box vectors for a single 'frame' to lengths and angles.""" # calculate the lengths of the vectors through taking the norm of # them unitcell_lengths = [] for basis in box_vectors: unitcell_lengths.append(np.linalg.norm(basis)) unitcell_lengths = np.array(unitcell_lengths) # calculate the angles for the vectors unitcell_angles = np.array([np.degrees( np.arccos(np.dot(box_vectors[i], box_vectors[j])/ (np.linalg.norm(box_vectors[i]) * np.linalg.norm(box_vectors[j])))) for i, j in [(0,1), (1,2), (2,0)]]) return unitcell_lengths, unitcell_angles def json_top_atom_count(json_str): """Count the number of atoms in a JSON topology used by wepy HDF5.""" top_d = json.loads(json_str) atom_count = 0 atom_count = 0 for chain in top_d['chains']: for residue in chain['residues']: atom_count += len(residue['atoms']) return atom_count PK!o wepy/walker.pyimport random as rand import logging def 
split(walker, number=2): # calculate the weight of all child walkers split uniformly split_prob = walker.weight / (number) # make the clones clones = [] for i in range(number): clones.append(type(walker)(walker.state, split_prob)) return clones def keep_merge(walkers, keep_idx): weights = [walker.weight for walker in walkers] # but we add their weight to the new walker new_weight = sum(weights) # create a new walker with the keep_walker state new_walker = type(walkers[0])(walkers[keep_idx].state, new_weight) return new_walker def merge(walkers): """Merge a list of walkers, keeping the state of one of them (chosen randomly by weight) and adding the weights. """ weights = [walker.weight for walker in walkers] # choose a walker according to their weights to keep its state keep_walker = rand.choices(walkers, weights=weights)[0] keep_idx = walkers.index(keep_walker) # TODO do we need this? # the others are "squashed" and we lose their state # squashed_walkers = set(walkers).difference(keep_walker) # but we add their weight to the new walker new_weight = sum(weights) # create a new walker with the keep_walker state new_walker = type(walkers[0])(keep_walker.state, new_weight) return new_walker, keep_idx class Walker(object): def __init__(self, state, weight): self.state = state self.weight = weight def clone(self, number=1): """Clone this walker by making a copy with the same state and split the probability uniformly between clones. The number is the increase in the number of walkers. e.g. number=1 will return 2 walkers with the same state as this object but with probability split 50/50 between them """ # calculate the weight of all child walkers split uniformly split_prob = self.weight / (number+1) # make the clones clones = [] for i in range(number+1): clones.append(type(self)(self.state, split_prob)) return clones def squash(self, merge_target): new_weight = self.weight + merge_target.weight return type(self)(merge_target.state, new_weight) def merge(self, other_walkers): return merge([self]+other_walkers) class WalkerState(object): def __init__(self, **kwargs): self._data = kwargs def __getitem__(self, key): return self._data[key] def dict(self): return self._data PK!wepy/work_mapper/__init__.pyPK!}KSJJwepy/work_mapper/mapper.pyfrom multiprocessing import Queue, JoinableQueue import queue import time import logging import multiprocessing as mp from wepy.work_mapper.worker import Worker, Task PY_MAP = map class ABCMapper(object): def __init__(self, **kwargs): pass def init(self, **kwargs): pass def cleanup(self, **kwargs): pass def map(self, **kwargs): pass class Mapper(object): def __init__(self, *args, **kwargs): self._worker_segment_times = {0 : []} def init(self, segment_func=None, **kwargs): if segment_func is None: raise ValueError("segment_func must be given") self._func = segment_func def cleanup(self, **kwargs): # nothing to do pass def map(self, *args, **kwargs): args = [list(arg) for arg in args] segment_times = [] results = [] for arg_idx in range(len(args[0])): start = time.time() result = self._func(*[arg[arg_idx] for arg in args]) end = time.time() segment_time = end - start segment_times.append(segment_time) results.append(result) self._worker_segment_times[0] = segment_times return results @property def worker_segment_times(self): return self._worker_segment_times class WorkerMapper(Mapper): def __init__(self, num_workers=None, worker_type=None, **kwargs): self._num_workers = num_workers self._worker_segment_times = {i : [] for i in range(self.num_workers)} # choose the type of the worker if worker_type is 
None: self._worker_type = Worker else: self._worker_type = worker_type @property def num_workers(self): return self._num_workers @num_workers.setter def num_workers(self, num_workers): self._num_workers = num_workers @property def worker_type(self): return self._worker_type @worker_type.setter def worker_type(self, worker_type): self._worker_type = worker_type def init(self, num_workers=None, **kwargs): super().init(**kwargs) # the number of workers must be given here or set as an object attribute if num_workers is None and self.num_workers is None: raise ValueError("The number of workers must be given, received {}".format(num_workers)) # if the number of workers was given for this init() call use # that, otherwise we use the default that was specified when # the object was created elif num_workers is None and self.num_workers is not None: num_workers = self.num_workers # Establish communication queues self._task_queue = JoinableQueue() self._result_queue = Queue() # Start workers, giving them all the queues self._workers = [self.worker_type(i, self._task_queue, self._result_queue) for i in range(num_workers)] # start the worker processes for worker in self._workers: worker.start() logging.info("Worker process started as name: {}; PID: {}".format(worker.name, worker.pid)) def cleanup(self): # send poison pills (stop signals) to the task queue to stop the workers in a nice way # and let them finish up for i in range(self.num_workers): self._task_queue.put((None, None)) # delete the queues and workers self._task_queue = None self._result_queue = None self._workers = None def make_task(self, *args, **kwargs): return Task(self._func, *args, **kwargs) def map(self, *args): map_process = mp.current_process() logging.info("Mapping from process {}; PID {}".format(map_process.name, map_process.pid)) # make tuples for the arguments to each function call task_args = zip(*args) num_tasks = len(args[0]) # Enqueue the jobs for task_idx, task_arg in enumerate(task_args): # a task will be the actual task and its task idx so we can # sort them later self._task_queue.put((task_idx, self.make_task(*task_arg))) logging.info("Waiting for tasks to be run") # Wait for all of the tasks to finish self._task_queue.join() # workers_done = [worker.done for worker in self._workers] # if all(workers_done): # get the results out in an unordered way. We rely on the # number of tasks we know we put in, because calling get() on # the queue until it looks empty would either block forever or, # with a timeout, risk dropping results that are merely slow to # arrive. 
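# ------------------------------------------------------------------
# NOTE (editorial sketch, not part of the original source): each record
# pulled off the result queue below is the tuple that Worker.run() puts
# in, namely (task_idx, worker_idx, task_time, result). Workers finish
# in arbitrary order, so the records come back unordered; sorting on the
# leading task_idx (results.sort() below) restores submission order, e.g.
#
#     [(2, 0, 0.51, c), (0, 1, 0.48, a), (1, 0, 0.50, b)]
#         --sort--> [(0, 1, 0.48, a), (1, 0, 0.50, b), (2, 0, 0.51, c)]
#
# This is also why exactly num_tasks get() calls are made rather than
# polling the queue until it looks empty.
# ------------------------------------------------------------------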
logging.info("Retrieving results") n_results = num_tasks results = [] while n_results > 0: logging.info("trying to retrieve result: {}".format(n_results)) result = self._result_queue.get() results.append(result) logging.info("Retrieved result {}: {}".format(n_results, result)) n_results -= 1 logging.info("No more results") logging.info("Retrieved results") # sort the results according to their task_idx results.sort() # save the task run times, so they can be accessed if desired, # after clearing the task times from the last mapping self._worker_segment_times = {i : [] for i in range(self.num_workers)} for task_idx, worker_idx, task_time, result in results: self._worker_segment_times[worker_idx].append(task_time) # then just return the values of the function return [result for task_idx, worker_idx, task_time, result in results] PK!4  wepy/work_mapper/worker.pyfrom multiprocessing import Process import multiprocessing as mp import time import logging class Worker(Process): def __init__(self, worker_idx, task_queue, result_queue): # call the Process constructor Process.__init__(self) self.worker_idx = worker_idx # the queues for work to be done and work done self.task_queue = task_queue self.result_queue = result_queue def run_task(self, task): return task() def run(self): worker_process = mp.current_process() logging.info("Worker process started as name: {}; PID: {}".format(worker_process.name, worker_process.pid)) while True: # get the next task task_idx, next_task = self.task_queue.get() # # check for the poison pill which is the signal to stop if next_task is None: logging.info('Worker: {}; received {} {}: FINISHED'.format( self.name, task_idx, next_task)) # mark the poison pill task as done self.task_queue.task_done() # and exit the loop break logging.info('Worker: {}; task_idx : {}; args : {} '.format( self.name, task_idx, next_task.args)) # run the task start = time.time() answer = self.run_task(next_task) end = time.time() task_time = end - start logging.info('Worker: {}; task_idx : {}; COMPLETED in {} s'.format( self.name, task_idx, task_time)) # (for joinable queue) tell the queue that the formerly # enqued task is complete self.task_queue.task_done() # put the results into the results queue with it's task # index so we can sort them later self.result_queue.put((task_idx, self.worker_idx, task_time, answer)) class Task(object): def __init__(self, func, *args): self.args = args self.func = func def __call__(self, **kwargs): # run the function passing in the args for running it and any # worker information in the kwargs return self.func(*self.args, **kwargs) PK!HD613%wepy-0.9.0.dist-info/entry_points.txtN+I/N.,()*O-zE%E%yz9V@PK!u[{,,wepy-0.9.0.dist-info/LICENSEMIT License Copyright (c) 2017 ADicksonLab Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
wepy-0.9.0.dist-info/: entry_points.txt, WHEEL, METADATA, and RECORD (stored as compressed binary data); LICENSE (verbatim copy of the top-level MIT LICENSE above).

Archive contents (from the zip central directory):

CHANGELOG.org
LICENSE
wepy/__init__.py
wepy/analysis/__init__.py
wepy/analysis/contig_tree.py
wepy/analysis/network.py
wepy/analysis/parents.py
wepy/analysis/transitions.py
wepy/boundary_conditions/__init__.py
wepy/boundary_conditions/boundary.py
wepy/boundary_conditions/rebinding.py
wepy/boundary_conditions/unbinding.py
wepy/hdf5.py
wepy/orchestration/__init__.py
wepy/orchestration/cli.py
wepy/orchestration/configuration.py
wepy/orchestration/orchestrator.py
wepy/reporter/__init__.py
wepy/reporter/dashboard.py
wepy/reporter/hdf5.py
wepy/reporter/reporter.py
wepy/reporter/restart.py
wepy/reporter/setup.py
wepy/reporter/wexplore/__init__.py
wepy/reporter/wexplore/dashboard.py
wepy/resampling/__init__.py
wepy/resampling/decisions/__init__.py
wepy/resampling/decisions/clone_merge.py
wepy/resampling/decisions/decision.py
wepy/resampling/distances/__init__.py
wepy/resampling/distances/distance.py
wepy/resampling/distances/openmm.py
wepy/resampling/distances/randomwalk.py
wepy/resampling/distances/receptor.py
wepy/resampling/resamplers/__init__.py
wepy/resampling/resamplers/random.py
wepy/resampling/resamplers/resampler.py
wepy/resampling/resamplers/revo.py
wepy/resampling/resamplers/wexplore.py
wepy/runners/__init__.py
wepy/runners/openmm.py
wepy/runners/randomwalk.py
wepy/runners/runner.py
wepy/sim_manager.py
wepy/util/__init__.py
wepy/util/mdtraj.py
wepy/util/util.py
wepy/walker.py
wepy/work_mapper/__init__.py
wepy/work_mapper/mapper.py
wepy/work_mapper/worker.py
wepy-0.9.0.dist-info/entry_points.txt
wepy-0.9.0.dist-info/LICENSE
wepy-0.9.0.dist-info/WHEEL
wepy-0.9.0.dist-info/METADATA
wepy-0.9.0.dist-info/RECORD
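# ----------------------------------------------------------------------
# NOTE (editorial sketch, not part of the original archive): weighted-
# ensemble bookkeeping in wepy/walker.py conserves probability: cloning
# splits a walker's weight evenly among the copies, while merging and
# squashing sum the weights of the combined walkers. A small check,
# assuming the package is importable as `wepy`:

from wepy.walker import Walker, WalkerState

if __name__ == "__main__":
    walker = Walker(WalkerState(positions=[0.0, 0.0]), weight=0.5)

    # clone(number=1) -> 2 walkers sharing the state, each with half the weight
    clones = walker.clone(number=1)
    assert [c.weight for c in clones] == [0.25, 0.25]

    # merging the clones (keeping one state, summing weights) restores 0.5
    merged, keep_idx = clones[0].merge([clones[1]])
    assert abs(merged.weight - walker.weight) < 1e-12

    # squash() keeps the target's state but adds this walker's weight to it
    squashed = clones[0].squash(clones[1])
    assert abs(squashed.weight - 0.5) < 1e-12
# ----------------------------------------------------------------------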