# File: stellargraph/__init__.py

""" Stellar Machine Learning Library """

__all__ = ["data", "layer", "mapper", "StellarDiGraph", "StellarGraph"]

# Top-level imports
from stellargraph.core.graph import StellarGraph, StellarDiGraph
from stellargraph.core.schema import GraphSchema


# File: stellargraph/core/__init__.py

# -*- coding: utf-8 -*-
#
# Copyright 2018 Data61, CSIRO
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This contains the core objects used by the StellarGraph library.
"""
from .graph import *
from .schema import *
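# --- Usage sketch (illustrative example, not part of the library source) ---
# A minimal end-to-end use of the top-level API exposed above: build a
# NetworkX multigraph, attach a numeric "feature" vector to each node, and
# wrap it in a StellarGraph. All graph and attribute names are illustrative.

import networkx as nx
from stellargraph import StellarGraph

nxg = nx.MultiGraph()
nxg.add_node("a", feature=[0.0, 1.0])
nxg.add_node("b", feature=[1.0, 0.0])
nxg.add_edge("a", "b")

Gs = StellarGraph(nxg, node_features="feature")
print(Gs)  # prints the node and edge counts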
# File: stellargraph/core/graph.py

# -*- coding: utf-8 -*-
#
# Copyright 2017-2018 Data61, CSIRO
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
The StellarGraph class that encapsulates information required for
a machine-learning ready graph used by models.
"""
__all__ = ["StellarGraph", "StellarDiGraph", "StellarGraphBase"]

from stellargraph.core.schema import EdgeType

import random
import itertools as it

import pandas as pd
import numpy as np

from networkx.classes.multigraph import MultiGraph
from networkx.classes.multidigraph import MultiDiGraph
from collections import Iterable, Iterator

from .. import globalvar
from .schema import GraphSchema
from .utils import is_real_iterable


def _convert_from_node_attribute(
    G, attr_name, node_types, node_type_name=None, dtype="f"
):
    """
    Transform the node attributes to feature vectors, for use with
    machine learning models.

    Each node is assumed to have a numeric array stored in the attribute
    ``attr_name`` that is suitable for use in machine learning models.

    Args:
        G: NetworkX graph
        attr_name: Name of node attribute to use for conversion
        node_types: Node types in graph
        node_type_name: (optional) The name of the node attribute specifying the type.
        dtype: (optional) The numpy datatype to create the features array.

    Returns:
        index_map: a dictionary of node_type -> {node_id: node_index}
        attribute_arrays: a dictionary of node_type -> numpy array storing the features
    """
    attribute_arrays = {}
    node_index_map = {}

    # Enumerate all nodes in graph
    nodes_by_type = {
        nt: [n for n, ndata in G.nodes(data=True) if ndata[node_type_name] == nt]
        for nt in node_types
    }

    # Get the target values for each node type
    for nt in node_types:
        nt_node_list = nodes_by_type[nt]

        # Add None to node list as ID of unknown nodes
        nt_node_list.append(None)

        # Create map between node id and index (including None)
        node_index_map[nt] = {nid: ii for ii, nid in enumerate(nt_node_list)}

        # The node data
        attr_data = [
            v if v is None else G.node[v].get(attr_name) for v in nt_node_list
        ]

        # Get the size of the features
        data_sizes = {
            np.size(G.node[v].get(attr_name, []))
            for v in nt_node_list
            if v is not None
        }

        # Warn if nodes don't have the attribute
        if 0 in data_sizes:
            print(
                "Warning: Some nodes have no value for attribute '{}', "
                "using default value.".format(attr_name)
            )
            data_sizes.discard(0)

        # Check all are the same for this node type
        if len(data_sizes) > 1:
            raise ValueError(
                "Data sizes in nodes of type {} are inconsistent "
                "for the attribute '{}' ".format(nt, attr_name)
            )

        # If some node types have no nodes with the attribute, skip them
        if len(data_sizes) == 0:
            continue

        # Create zero attribute array
        data_size = data_sizes.pop()

        # Dummy feature/target value for invalid nodes,
        # this will be inserted into the array in two cases:
        # 1. node ID of None (representing sampling for a missing neighbour)
        # 2. node with no attribute
        # TODO: Make these two cases more explicit, allow custom values.
        default_value = np.zeros(data_size)

        # Convert to numpy array with the requested dtype
        attribute_arrays[nt] = np.asarray(
            [x if x is not None else default_value for x in attr_data], dtype=dtype
        )

    return node_index_map, attribute_arrays
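# Example (illustrative): for a small graph whose nodes carry a numeric
# "feature" attribute and a "mytype" type attribute, this helper returns an
# index map per type plus a feature matrix with one extra all-zero row for
# the dummy node ID None:
#
#     >>> g = nx.MultiGraph()
#     >>> g.add_node("a", mytype="user", feature=[0.0, 1.0])
#     >>> g.add_node("b", mytype="user", feature=[1.0, 0.0])
#     >>> index_map, arrays = _convert_from_node_attribute(
#     ...     g, "feature", {"user"}, node_type_name="mytype")
#     >>> index_map["user"]
#     {'a': 0, 'b': 1, None: 2}
#     >>> arrays["user"].shape
#     (3, 2)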
def _convert_from_node_data(data, node_type_map, node_types, dtype="f"):
    """
    Store the node data as feature vectors, for use with machine learning models.

    For a single node type, the data can be either:
     * a Pandas DataFrame with the index being node IDs and the columns the
       numeric feature values. Note that the features must be numeric.
     * a list or iterable of `(node_id, node_feature)` pairs where node_feature
       is a value, a list of values, or a numpy array representing the numeric
       feature values.

    For multiple node types, the data can be either:
     * a dictionary of node_type -> DataFrame with the index of each DataFrame
       being node IDs and the columns the numeric feature values.
       Note that the features must be numeric and can be different sizes for
       each node type.
     * a list or iterable of `(node_id, node_feature)` pairs where node_feature
       is a value, a list of values, or a numpy array representing the numeric
       feature values.

    Args:
        data: dict, list or DataFrame
            The data for the nodes, partitioned by node type
        node_type_map: dict
            Mapping of node_id to node_type
        node_types: list
            List of the node types in the data
        dtype: Numpy datatype optional (default='float32')
            The numpy datatype to create the features array.

    Returns:
        index_map: a dictionary of node_type -> {node_id: node_index}
        attribute_arrays: a dictionary of node_type -> numpy array storing the features
    """
    # If data is a dict of pandas dataframes or iterables, pull the features
    # for each node type in the dictionary
    if isinstance(data, dict):
        # The keys should match the node types
        if not all(k in node_types for k in data.keys()):
            raise ValueError(
                "All node types in supplied feature dict should be in the graph"
            )

        data_arrays = {}
        data_index = {}
        for nt, arr in data.items():
            if isinstance(arr, pd.DataFrame):
                node_index_map = {nid: nii for nii, nid in enumerate(arr.index)}
                try:
                    data_arr = arr.values.astype(dtype)
                except ValueError:
                    raise ValueError(
                        "Node data passed as Pandas DataFrames should contain only numeric values"
                    )

            elif isinstance(arr, (Iterable, list)):
                data_arr = []
                node_index_map = {}
                for ii, (node_id, datum) in enumerate(arr):
                    data_arr.append(datum)
                    node_index_map[node_id] = ii
                data_arr = np.vstack(data_arr)

            else:
                raise TypeError(
                    "Node data should be a pandas DataFrame, an iterable, a list, "
                    "or the name of a node attribute"
                )

            # Add default value to end of feature array
            default_value = np.zeros(data_arr.shape[1])
            data_arrays[nt] = np.vstack([data_arr, default_value])

            node_index_map[None] = data_arr.shape[0]
            data_index[nt] = node_index_map

    # If data is a pd.DataFrame, try pulling out the type
    elif isinstance(data, pd.DataFrame):
        if len(node_types) > 1:
            raise TypeError(
                "When there is more than one node type, pass node features as a dictionary."
            )
        node_type = next(iter(node_types))
        data_index, data_arrays = _convert_from_node_data(
            {node_type: data}, node_type_map, node_types, dtype
        )

    # If data is an iterator, try recreating the nodes by type
    elif isinstance(data, (Iterator, list)):
        node_data_by_type = {nt: [] for nt in node_types}
        for d in data:
            node_type = node_type_map.get(d[0])
            if node_type is None:
                raise TypeError("Node type not found in importing feature vectors!")
            node_data_by_type[node_type].append(d)

        data_index, data_arrays = _convert_from_node_data(
            node_data_by_type, node_type_map, node_types, dtype
        )

    else:
        raise TypeError(
            "Node data should be a dictionary, a pandas DataFrame, an iterable, or a tuple."
        )

    return data_index, data_arrays
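# Example (illustrative): the accepted node data forms produce equivalent
# results; a dict of DataFrames keyed by node type, a plain DataFrame for a
# single node type, or an iterable of (node_id, feature_vector) pairs:
#
#     >>> features = pd.DataFrame([[0.0, 1.0], [1.0, 0.0]], index=["n1", "n2"])
#     >>> type_map = {"n1": "paper", "n2": "paper"}
#     >>> index, arrays = _convert_from_node_data(
#     ...     {"paper": features}, type_map, ["paper"])
#     >>> arrays["paper"].shape  # two nodes plus one appended default row
#     (3, 2)
#     >>> pairs = iter([("n1", [0.0, 1.0]), ("n2", [1.0, 0.0])])
#     >>> index, arrays = _convert_from_node_data(pairs, type_map, ["paper"])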
class StellarGraphBase:
    """
    StellarGraph class for undirected graph ML models. It stores both
    graph information from a NetworkX Graph object as well as features
    for machine learning.

    To create a StellarGraph object ready for machine learning, at a
    minimum pass the graph structure to the StellarGraph as a NetworkX
    graph:

    For undirected models::

        Gs = StellarGraph(nx_graph)

    For directed models::

        Gs = StellarDiGraph(nx_graph)

    To create a StellarGraph object with node features, supply the features
    as a numeric feature vector for each node.

    To take the feature vectors from a node attribute in the original
    NetworkX graph, supply the attribute name to the ``node_features``
    argument::

        Gs = StellarGraph(nx_graph, node_features="feature")

    where the nx_graph contains nodes that have a "feature" attribute
    containing the feature vector for the node. All nodes of the same type
    must have the same size feature vectors.

    Alternatively, supply the node features as Pandas DataFrame objects with
    the index of the DataFrame set to the node IDs. For graphs with a single
    node type, you can supply the DataFrame object directly to StellarGraph::

        node_data = pd.DataFrame(
            [feature_vector_1, feature_vector_2, ..],
            index=[node_id_1, node_id_2, ...])
        Gs = StellarGraph(nx_graph, node_features=node_data)

    For graphs with multiple node types, provide the node features as Pandas
    DataFrames for each type separately, as a dictionary by node type.
    This allows node features to have different sizes for each node type::

        node_data = {
            node_type_1: pd.DataFrame(...),
            node_type_2: pd.DataFrame(...),
        }
        Gs = StellarGraph(nx_graph, node_features=node_data)

    You can also supply the node feature vectors as an iterator of `node_id`
    and feature vector pairs, for graphs with single and multiple node types::

        node_data = zip([node_id_1, node_id_2, ...],
            [feature_vector_1, feature_vector_2, ..])
        Gs = StellarGraph(nx_graph, node_features=node_data)

    Args:
        node_type_name: str, optional (default=globalvar.TYPE_ATTR_NAME)
            This is the name for the node types that StellarGraph uses
            when processing heterogeneous graphs. StellarGraph will
            look for this attribute in the nodes of the graph to determine
            their type.

        edge_type_name: str, optional (default=globalvar.TYPE_ATTR_NAME)
            This is the name for the edge types that StellarGraph uses
            when processing heterogeneous graphs. StellarGraph will
            look for this attribute in the edges of the graph to determine
            their type.

        node_features: str, dict, list or DataFrame, optional (default=None)
            This tells StellarGraph where to find the node feature information
            required by some graph models. These are expected to be
            a numeric feature vector for each node in the graph.
    """

    def __init__(self, incoming_graph_data=None, **attr):
        # TODO: add doc string
        super().__init__(incoming_graph_data, **attr)

        # Names of attributes that store the type of nodes and edges
        self._node_type_attr = attr.get("node_type_name", globalvar.TYPE_ATTR_NAME)
        self._edge_type_attr = attr.get("edge_type_name", globalvar.TYPE_ATTR_NAME)

        # Names for the feature/target type (used if they are supplied and
        # feature/target spec not supplied)
        self._feature_attr = attr.get("feature_name", globalvar.FEATURE_ATTR_NAME)
        self._target_attr = attr.get("target_name", globalvar.TARGET_ATTR_NAME)

        # Ensure that the incoming graph data has node & edge types
        # TODO: This requires traversing all nodes and edges. Is there another way?
        # TODO: Should we add the default values as class arguments for these?
        node_types = set()
        type_for_node = {}
        for n, ndata in self.nodes(data=True):
            if self._node_type_attr not in ndata:
                ndata[self._node_type_attr] = "default"
            type_for_node[n] = ndata[self._node_type_attr]
            node_types.add(ndata[self._node_type_attr])

        edge_types = set()
        for n1, n2, k, edata in self.edges(keys=True, data=True):
            if self._edge_type_attr not in edata:
                edata[self._edge_type_attr] = "default"
            edge_types.add(edata[self._edge_type_attr])

        # New style: we are passed numpy arrays or pandas arrays of the feature vectors
        node_features = attr.get("node_features", None)
        dtype = attr.get("dtype", "float32")

        # If node_features is a string, load features from this attribute of the nodes in the graph
        if isinstance(node_features, str):
            data_index_maps, data_arrays = _convert_from_node_attribute(
                self, node_features, node_types, self._node_type_attr, dtype
            )

        # Otherwise try importing node_features as a Numpy array or Pandas DataFrame
        elif node_features is not None:
            data_index_maps, data_arrays = _convert_from_node_data(
                node_features, type_for_node, node_types, dtype
            )

        else:
            data_index_maps = {}
            data_arrays = {}

        # TODO: What other convenience attributes do we need?
        self._nodes_by_type = None

        # This stores the feature vectors per node type as numpy arrays
        self._node_attribute_arrays = data_arrays

        # This stores the map between node ID and index in the attribute arrays
        self._node_index_maps = data_index_maps

    def __repr__(self):
        directed_str = "Directed" if self.is_directed() else "Undirected"
        s = "{}: {} multigraph\n".format(type(self).__name__, directed_str)
        s += "    Nodes: {}, Edges: {}\n".format(
            self.number_of_nodes(), self.number_of_edges()
        )
        return s

    def fit_attribute_spec(self, *args, **kwargs):
        print("Fit attribute spec is deprecated")

    def check_graph_for_ml(self, features=True):
        """
        Checks if all properties required for machine learning training/inference
        are set up. An error will be raised if the graph is not correctly set up.
        """
        # TODO: These are simple tests and miss many problems that could arise, improve!
        # TODO: At this point perhaps we should generate features rather than in fit_attribute_spec
        # TODO: but if so how do we know whether to fit the attribute specs or not?

        # Check features on the nodes:
        if features and len(self._node_attribute_arrays) == 0:
            raise RuntimeError(
                "This StellarGraph has no numeric feature attributes for nodes. "
                "Node features are required for machine learning."
            )

        # How about checking the schema?
""" # TODO: change method's name to node_features(), and add @property decorator if not is_real_iterable(nodes): nodes = [nodes] # Get the node type # TODO: This is slow, refactor so that self._node_index_maps gives the node type if node_type is None: node_types = { self.node[n].get(self._node_type_attr) for n in nodes if n is not None } if None in node_types: raise ValueError( "All nodes must have a type specified as the " "'{}' attribute.".format(self._node_type_attr) ) if len(node_types) > 1: raise ValueError("All nodes must be of the same type.") if len(node_types) == 0: raise ValueError( "At least one node must be given if node_type not specified" ) node_type = node_types.pop() # Check node_types if ( node_type not in self._node_attribute_arrays or node_type not in self._node_index_maps ): raise ValueError("Features not found for node type '{}'") # Edge case: if we are given no nodes, what do we do? if len(nodes) == 0: feature_size = self._node_attribute_arrays[node_type].shape[1] return np.empty((0, feature_size)) # Get index for nodes of this type nt_id_to_index = self._node_index_maps[node_type] node_indices = [nt_id_to_index.get(n) for n in nodes] if None in node_indices: raise ValueError( "Incorrect node specified or nodes of multiple types found." ) features = self._node_attribute_arrays[node_type][node_indices] return features def node_feature_sizes(self, node_types=None): """ Get the feature sizes for the specified node types. Args: node_types: (list) A list of node types. If None all current node types will be used. Returns: A dictionary of node type and integer feature size. """ # TODO: unit test! if not node_types: node_types = self.node_types self.check_graph_for_ml(features=True) fsize = {nt: self._node_attribute_arrays[nt].shape[1] for nt in node_types} return fsize def nodes_of_type(self, node_type=None): """ Get the nodes of the graph with the specified node types. Args: node_type: Returns: A list of node IDs with type node_type """ # TODO: unit test! if node_type is None: return list(self) else: return [ n for n, ndata in self.nodes(data=True) if ndata.get(self._node_type_attr) == node_type ] @property def node_types(self): """ Get a list of all node types in the graph. Returns: set of types """ # TODO: unit test! # TODO: create a schmea when we geenrate _node_attribute_arrays and use it? if len(self._node_attribute_arrays) > 0: return set(self._node_attribute_arrays.keys()) else: return { ndata.get(self._node_type_attr) for n, ndata in self.nodes(data=True) } def info(self, show_attributes=True, sample=None): """ Return an information string summarizing information on the current graph. This includes node and edge type information and their attributes. Note: This requires processing all nodes and edges and could take a long time for a large graph. Args: sample (int): To speed up the graph analysis, use only a random sample of this many nodes and edges. Returns: An information string. 
""" directed_str = "Directed" if self.is_directed() else "Undirected" s = "{}: {} multigraph\n".format(type(self).__name__, directed_str) s += " Nodes: {}, Edges: {}\n".format( self.number_of_nodes(), self.number_of_edges() ) # Sample the nodes for our analysis if sample: all_nodes = list(self.nodes) snodes = random.sample(all_nodes, sample) sedges = self.edges(snodes, keys=True) else: snodes = None sedges = None gs = self.create_graph_schema(create_type_maps=True, nodes=snodes, edges=sedges) # Go over all node types s += "\n Node types:\n" for nt in gs.node_types: # Filter nodes by type nt_nodes = [ ndata for n, ndata in self.nodes(data=True) if gs.get_node_type(n) == nt ] s += " {}: [{}]\n".format(nt, len(nt_nodes)) # Get the attributes for this node type attrs = set(it.chain(*[ndata.keys() for ndata in nt_nodes])) attrs.discard(self._node_type_attr) if show_attributes and len(attrs) > 0: s += " Attributes: {}\n".format(attrs) s += " Edge types: " s += ", ".join(["{}-{}->{}".format(*e) for e in gs.schema[nt]]) + "\n" s += "\n Edge types:\n" for et in gs.edge_types: # Filter edges by type et_edges = [ e[3] for e in self.edges(keys=True, data=True) if gs.get_edge_type(e[:3]) == et ] if len(et_edges) > 0: s += " {et[0]}-{et[1]}->{et[2]}: [{len}]\n".format( et=et, len=len(et_edges) ) # Get the attributes for this edge type attrs = set(it.chain(*[edata.keys() for edata in et_edges])) attrs.discard(self._edge_type_attr) if show_attributes and len(attrs) > 0: s += " Attributes: {}\n".format(attrs) return s def create_graph_schema(self, create_type_maps=True, nodes=None, edges=None): """ Create graph schema in dict of dict format from current graph. Note the assumption we make that there is only one edge of a particular edge type per node pair. This means that specifying an edge by node0, node1 and edge type is unique. Returns: GraphSchema object. 
""" if nodes is None: nodes = self.nodes() elif create_type_maps is True: raise ValueError( "Creating type mapes for subsampled nodes is not supported" ) if edges is None: edges = self.edges(keys=True) elif create_type_maps is True: raise ValueError( "Creating type mapes for subsampled edges is not supported" ) # Create node type index list node_types = sorted( {self.node[n].get(self._node_type_attr) for n in nodes}, key=str ) if None in node_types: raise ValueError( "All nodes should have a type set in the '{}' attribute.".format( self._node_type_attr ) ) graph_schema = {nt: set() for nt in node_types} # Create edge type index list edge_types = set() for n1, n2, k in edges: edata = self.adj[n1][n2][k] # Edge type tuple node_type_1 = self.node[n1][self._node_type_attr] node_type_2 = self.node[n2][self._node_type_attr] edge_type = edata[self._edge_type_attr] # Add edge type to node_type_1 data edge_type_tri = EdgeType(node_type_1, edge_type, node_type_2) edge_types.add(edge_type_tri) graph_schema[node_type_1].add(edge_type_tri) # Also add type to node_2 data if not digraph if not self.is_directed(): edge_type_tri = EdgeType(node_type_2, edge_type, node_type_1) edge_types.add(edge_type_tri) graph_schema[node_type_2].add(edge_type_tri) # Create ordered list of edge_types edge_types = sorted(edge_types) # Create keys for node and edge types schema = { node_label: [ edge_types[einx] for einx in sorted([edge_types.index(et) for et in list(node_data)]) ] for node_label, node_data in graph_schema.items() } # Create schema object gs = GraphSchema() gs._is_directed = self.is_directed() gs.edge_types = edge_types gs.node_types = node_types gs.schema = schema # Create quick type lookups for nodes and edges. # Note: we encode the type index, in the assumption it will take # less storage. if create_type_maps: node_type_map = { n: node_types.index(ndata[self._node_type_attr]) for n, ndata in self.nodes(data=True) } edge_type_map = { (edge[0], edge[1], edge[2]): edge_types.index( EdgeType( node_types[node_type_map[edge[0]]], edge[3][self._edge_type_attr], node_types[node_type_map[edge[1]]], ) ) for edge in self.edges(keys=True, data=True) } gs.node_type_map = node_type_map gs.edge_type_map = edge_type_map return gs class StellarGraph(StellarGraphBase, MultiGraph): def __init__(self, incoming_graph_data=None, **attr): super().__init__(incoming_graph_data, **attr) class StellarDiGraph(StellarGraphBase, MultiDiGraph): def __init__(self, incoming_graph_data=None, **attr): super().__init__(incoming_graph_data, **attr) PK!+'00stellargraph/core/schema.py# -*- coding: utf-8 -*- # # Copyright 2017-2018 Data61, CSIRO # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import queue from collections.__init__ import namedtuple from ..core.utils import is_real_iterable EdgeType = namedtuple("EdgeType", "n1 rel n2") class GraphSchema: """ Class to encapsulate the schema information for a heterogeneous graph. Typically this should be created from a StellarGraph object, using the create_graph_schema method. 
""" _is_directed = False node_types = None edge_types = None schema = None node_type_map = None edge_type_map = None def is_directed(self): return self._is_directed def node_index(self, name): """ Return node type index from the type name Args: index: name of the node type. Returns: Numerical node type index """ try: index = self.node_types.index(name) except ValueError: print("Warning: Node key '{}' not found.".format(name)) index = None return index def node_index_to_type(self, index): """ Return node type key from the numerical index Args: index: Numerical index of node type. Returns: Node type name """ try: key = self.node_types[index] except IndexError: print( "Warning: Node index '{}' invalid. Should be an integer between 0 and {}.".format( index, len(self.node_types) - 1 ) ) key = None return key def edge_index(self, edge_type): """ Return edge type index from the type tuple Args: index: Tuple of (node1_type, edge_type, node2_type) Returns: Numerical edge type index """ try: index = self.edge_types.index(edge_type) except ValueError: print("Warning: Edge key '{}' not found.".format(edge_type)) index = None return index def edge_index_to_type(self, index): """ Return edge type triple from the numerical index Args: index: Numerical index of edge type. Returns: Edge type triple """ try: key = self.edge_types[index] except IndexError: print( "Warning: Edge index '{}' invalid. Should be an integer between 0 and {}.".format( index, len(self.edge_types) - 1 ) ) key = None return key def __repr__(self): s = "{}:\n".format(type(self).__name__) for nt in self.schema: s += "node type: {}\n".format(nt) for e in self.schema[nt]: s += " {} -- {} -> {}\n".format(*e) return s def get_node_type(self, node, index=False): """ Returns the type of the node specified either by node ID. Args: node: The node ID from the original graph index: Return a numeric type index if True, otherwise return the type name. Returns: A node type name or index """ # TODO: remove "get_" from the name try: nt = self.node_type_map[node] node_type = nt if index else self.node_types[nt] except IndexError: print("Warning: Node '{}' not found in type map.".format(node)) node_type = None return node_type def is_of_edge_type(self, edge, edge_type, index=False): """ Tests if an edge is of the given edge type. The edge is specified as a standard NetworkX multigraph edge triple of (node_id_1, node_id_2, edge_key). If the graph schema is undirected then the ordering of the nodes of the edge type doesn't matter. Args: edge: The edge ID from the original graph as a triple. edge_type: The type of the edge as a tuple or EdgeType triple. Returns: True if the edge is of the given type """ try: if edge in self.edge_type_map: eindex = self.edge_type_map[edge] elif not self.is_directed(): eindex = self.edge_type_map[(edge[1], edge[0], edge[2])] else: raise IndexError et = self.edge_types[eindex] if self.is_directed(): match = et == edge_type else: match = (et == edge_type) or ( et == (edge_type[2], edge_type[1], edge_type[0]) ) except IndexError: print("Warning: Edge '{}' not found in type map.".format(edge)) match = False return match def get_edge_type(self, edge, index=False): """ Return the type of the edge as a triple of (source_node_type, relation_type, dest_node_type). The edge is specified as a standard NetworkX multigraph edge triple of (node_id_1, node_id_2, edge_key). 
    def get_edge_type(self, edge, index=False):
        """
        Return the type of the edge as a triple of
        (source_node_type, relation_type, dest_node_type).

        The edge is specified as a standard NetworkX multigraph edge
        triple of (node_id_1, node_id_2, edge_key).

        If the graph schema is undirected and there is an edge type for
        the edge (node_id_2, node_id_1, edge_key) then the edge type
        for this edge will be returned permuted to match the node order.

        Args:
            edge: The edge ID from the original graph as a triple.
            index: Return a numeric type index if True, otherwise return the type triple.

        Returns:
            A node type triple or index.
        """
        # TODO: remove "get_" from the name
        try:
            if edge in self.edge_type_map:
                et = self.edge_type_map[edge]
                edge_type = et if index else self.edge_types[et]

            elif not self.is_directed():
                et = self.edge_type_map[(edge[1], edge[0], edge[2])]
                if index:
                    edge_type = et
                else:
                    et = self.edge_types[et]
                    edge_type = EdgeType(et[2], et[1], et[0])
            else:
                raise IndexError
        except (KeyError, IndexError):
            # edge_type_map is a dict, so a missing edge raises KeyError
            print("Warning: Edge '{}' not found in type map.".format(edge))
            edge_type = None
        return edge_type

    def sampling_tree(self, head_node_types, n_hops):
        """
        Returns a sampling tree for the specified head node types
        for neighbours up to n_hops away.
        A unique ID is created for each sampling node.

        Args:
            head_node_types: An iterable of the types of the head nodes
            n_hops: The number of hops away

        Returns:
            A list of the form [(type_adjacency_index, node_type, [children]), ...]
            where children are (type_adjacency_index, node_type, [children])
        """
        adjacency_list = self.type_adjacency_list(head_node_types, n_hops)

        def pack_tree(nodes, level):
            return [
                (n, adjacency_list[n][0], pack_tree(adjacency_list[n][1], level + 1))
                for n in nodes
            ]

        # The first k nodes will be the head nodes in the adjacency list
        # TODO: generalize this?
        return adjacency_list, pack_tree(range(len(head_node_types)), 0)

    def sampling_layout(self, head_node_types, num_samples):
        """
        For a sampling scheme with a list of head node types and the
        number of samples per hop, return the map from the actual
        sample index to the adjacency list index.

        Args:
            head_node_types: A list of node types of the head nodes.
            num_samples: A list of integers that are the number of neighbours
                to sample at each hop.

        Returns:
            A list containing, for each head node type, a list consisting of
            tuples of (node_type, sampling_index). The list matches the
            list given by the method `type_adjacency_list(...)` and can be
            used to reformat the samples given by `SampledBreadthFirstWalk`
            to that expected by the HinSAGE model.
        """
        adjacency_list = self.type_adjacency_list(head_node_types, len(num_samples))
        sample_index_layout = []
        sample_inverse_layout = []

        # The head nodes are the first K nodes in the adjacency list
        # TODO: generalize this?
        for ii, hnt in enumerate(head_node_types):
            adj_to_samples = [(adj[0], []) for adj in adjacency_list]

            # The head nodes will be the first sample in the appropriate
            # sampling list, and the ii-th in the adjacency list
            sample_to_adj = {0: ii}
            adj_to_samples[ii][1].append(0)

            # Set the start group as the head node and point the index to the next hop
            node_groups = [(ii, hnt)]
            sample_index = 1

            # Iterate over all hops
            for jj, nsamples in enumerate(num_samples):
                next_node_groups = []
                for a_key, nt1 in node_groups:
                    # For each node we sample from all edge types from that node
                    edge_types = self.schema[nt1]

                    # We want to place the samples for these edge types in the correct
                    # place in the adjacency list
                    next_keys = adjacency_list[a_key][1]

                    for et, next_key in zip(edge_types, next_keys):
                        # These are pseudo-samples for each edge type
                        sample_types = [(next_key, et.n2)] * nsamples
                        next_node_groups.extend(sample_types)

                        # Store the node type, adjacency and sampling indices
                        sample_to_adj[sample_index] = next_key
                        adj_to_samples[next_key][1].append(sample_index)
                        sample_index += 1

                        # Sanity check
                        assert adj_to_samples[next_key][0] == et.n2

                node_groups = next_node_groups

            # Add samples to layout and inverse layout
            sample_index_layout.append(sample_to_adj)
            sample_inverse_layout.append(adj_to_samples)

        return sample_inverse_layout

    def type_adjacency_list(self, head_node_types, n_hops):
        """
        Creates a BFS sampling tree as an adjacency list from head node types.

        Each list element is a tuple of::

            (node_type, [child_1, child_2, ...])

        where ``child_k`` is an index pointing to the child of the current
        node. Note that the children are ordered by edge type.

        Args:
            head_node_types: Node types of head nodes.
            n_hops: How many hops to sample.

        Returns:
            List of form ``[ (node_type, [children]), ...]``
        """
        if not isinstance(head_node_types, (list, tuple)):
            raise TypeError("The head node types should be a list or tuple.")

        if not isinstance(n_hops, int):
            raise ValueError("n_hops should be an integer")

        to_process = queue.Queue()

        # Add head nodes
        clist = list()
        for ii, hn in enumerate(head_node_types):
            if n_hops > 0:
                to_process.put((hn, ii, 0))
            clist.append((hn, []))

        while not to_process.empty():
            # Get node, node index, and level
            nt, ninx, lvl = to_process.get()

            # The ordered list of edge types from this node type
            ets = self.schema[nt]

            # Iterate over edge types (in order)
            for et in ets:
                cinx = len(clist)
                clist.append((et.n2, []))
                clist[ninx][1].append(cinx)

                if n_hops > lvl + 1:
                    to_process.put((et.n2, cinx, lvl + 1))

        return clist
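# --- Usage sketch (illustrative example, not part of the library source) ---
# type_adjacency_list builds the per-hop tree of node types used by
# HinSAGE-style samplers. For the homogeneous toy schema created earlier
# (one 'default' node type with a single self-relation), two hops from a
# single head node give:

adj = schema.type_adjacency_list(["default"], n_hops=2)
print(adj)
# [('default', [1]), ('default', [2]), ('default', [])]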
# File: stellargraph/core/utils.py

# -*- coding: utf-8 -*-
#
# Copyright 2018 Data61, CSIRO
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections


def is_real_iterable(x):
    """
    Tests if x is an iterable and is not a string.

    Args:
        x: The object to test.

    Returns:
        True if x is an iterable (but not a string) and False otherwise
    """
    return isinstance(x, collections.Iterable) and not isinstance(x, (str, bytes))
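# --- Usage sketch (illustrative example, not part of the library source) ---
from stellargraph.core.utils import is_real_iterable

print(is_real_iterable([1, 2, 3]))  # True
print(is_real_iterable("abc"))      # False: strings are deliberately excluded
print(is_real_iterable(42))         # False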
# File: stellargraph/data/__init__.py

# -*- coding: utf-8 -*-
#
# Copyright 2018 Data61, CSIRO
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
The data package contains classes and functions to read, process, and query graph data
"""

# Expose the stellargraph.data classes:
from .explorer import *
from .edge_splitter import *
from .node_splitter import *
from .loader import from_epgm, load_dataset_BlogCatalog3


# File: stellargraph/data/converter.py

# -*- coding: utf-8 -*-
#
# Copyright 2018 Data61, CSIRO
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod

import numpy as np
from keras.utils.np_utils import to_categorical

from stellargraph.core.graph import StellarGraphBase


class NodeAttributeSpecification:
    """
    This class converts numeric and non-numeric node attributes to the
    appropriate numeric vectors for machine learning.

    In the StellarML library, all machine learning tasks that require
    feature and target attribute specifications should be passed an
    object of this class.

    # Usage

    Instantiation::

        nfs = NodeAttributeSpecification()

    To add an attribute for a node type, choose an appropriate Converter
    class and use the following methods:

    For a single attribute of node type node_type use `add_attribute`::

        nfs.add_attribute(node_type, attribute_name, Converter, **conv_args)

    For multiple attributes of a single node type, using a single converter
    class, use `add_attribute_list`::

        nfs.add_attribute_list(node_type, attribute_names, Converter, **conv_args)

    To add all attributes using the same converter class, use
    `add_all_attributes`; you will need to provide a StellarGraph object so
    that the node attributes can be extracted::

        nfs.add_all_attributes(graph, node_type, Converter, **conv_args)

    # Converter classes:

    There are multiple converter classes that can be used depending upon the
    attribute values and whether the attribute specification required is for
    features or targets.

    * BinaryConverter: This converter will create a value with a one if the
      attribute exists in the node attributes and zero if it does not.

    * CategoricalConverter: This converter takes an attribute that has
      multiple values (categories) and converts the categories to integers.

    * OneHotCategoricalConverter: This converter takes an attribute that has
      multiple values (categories) and converts the categories to one-hot
      vectors of length equal to the number of categories.

    * NumericConverter: This converter takes an attribute that has integer
      or floating point values and optionally normalizes them by mean and
      standard deviation.

    More information on these converters and their parameters can be found
    in their individual documentation. Also note that the converter
    parameters should be passed to the attribute specification methods, not
    directly to the converter.
    """

    def __init__(self):
        self._node_specs = {}
        self._node_feature_specs = {}

    def add_attribute(self, node_type, attr, converter, **conv_args):
        """
        Add a named attribute with specified converter for a node type

        Args:
            node_type: Node type that contains the attribute (must be
                specified, even if there is only a single node type)
            attr: Attribute name
            converter: Converter class (this should be the class, not an object)
            **conv_args: Optional arguments to the converter, specific to converter
        """
        if not issubclass(converter, StellarAttributeConverter):
            raise TypeError(
                "Converter should be a subclass of StellarAttributeConverter"
            )

        node_type_spec = self._node_specs.get(node_type, {})
        node_type_spec[attr] = converter(**conv_args)
        self._node_specs[node_type] = node_type_spec

    def add_attribute_list(self, node_type, attrs, converter, **conv_args):
        """
        Add multiple named attributes with the specified converter; note that
        an individual converter object will be created for each attribute.

        Args:
            node_type: Node type that contains the attribute names (must be
                specified, even if there is only a single node type)
            attrs: List of attribute names to use
            converter: Converter class (this should be the class, not an object)
            **conv_args: Optional arguments to the converter, specific to converter
        """
        if not issubclass(converter, StellarAttributeConverter):
            raise TypeError(
                "Converter should be a subclass of StellarAttributeConverter"
            )

        node_type_spec = self._node_specs.get(node_type, {})
        for attr in attrs:
            node_type_spec[attr] = converter(**conv_args)
        self._node_specs[node_type] = node_type_spec

    def add_all_attributes(
        self, graph, node_type, converter, ignored_attributes=[], **conv_args
    ):
        """
        Add multiple named attributes with the specified converter to all
        attributes of the given node type found in the graph.

        Args:
            graph: A StellarGraph object containing nodes of the specified type.
            node_type: Node type that contains the attribute names (must be
                specified, even if there is only a single node type)
            converter: Converter class (this should be the class, not an object)
            ignored_attributes: (Optional) a list of attribute names to not include.
            **conv_args: Optional arguments to the converter, specific to converter
        """
        if not issubclass(converter, StellarAttributeConverter):
            raise TypeError(
                "Converter should be a subclass of StellarAttributeConverter"
            )
        if not isinstance(graph, StellarGraphBase):
            raise TypeError("Graph should be a StellarGraph")

        # Go through graph to find node attributes
        all_attrs = set(
            k for v in graph.nodes_of_type(node_type) for k in graph.node[v].keys()
        )

        # Remove any ignored attributes
        attrs = all_attrs.difference(set(ignored_attributes))

        # Don't use node type as attribute:
        attrs.discard(graph._node_type_attr)

        # Set found attributes with converter
        self.add_attribute_list(node_type, attrs, converter, **conv_args)
""" return node_type in self._node_specs def get_types(self): """ Returns a list of the node types in this attribute specification """ return list(self._node_specs.keys()) def get_attributes(self, node_type=None): """ Get the list of attributes in a defined order for the given node type. Args: node_type: Node type key, if None and there is a single node type the attributes of that type are returned. Returns: List of attribute IDs """ if node_type is None: if len(self._node_specs) == 1: node_attrs = next(iter(self._node_specs.values())).keys() else: raise RuntimeError( "Please specify the node type when there are multiple node types" ) elif node_type in self._node_specs: node_attrs = self._node_specs[node_type].keys() else: raise ValueError( "There are no nodes of type '{}' set as targets".format(node_type) ) return sorted(node_attrs, key=str) def get_feature_indices(self, node_type): """ Gives the ranges of the indices in the numeric vector corresponding to each attribute the specification. Args: node_type: The node type Returns: A dictionary of attribute index ranges in the form: ``` { attribute_jj : (start_index, end_index) ... } ``` """ if node_type not in self._node_specs: return {} node_type_spec = self._node_specs[node_type] feature_list = sorted(node_type_spec.keys(), key=str) # Run over sorted array and map attribute to # range of values in the feature start_ind = 0 feature_id_to_range = {} for attr in feature_list: conv = node_type_spec[attr] end_ind = start_ind + len(conv) feature_id_to_range[attr] = (start_ind, end_ind) start_ind = end_ind return feature_id_to_range def get_converter(self, node_type, attr): """ Get the converter object for the specified node type and attribute name Args: node_type: Node type attr: Attribute name Returns: The converter object """ if node_type not in self._node_specs: raise KeyError("Node type '{}' not in known node types.".format(node_type)) if attr not in self._node_specs[node_type]: raise KeyError( "Attribute '{}' not known for node type {}.".format(attr, node_type) ) return self._node_specs[node_type][attr] def get_output_size(self, node_type=None): """ Get the size of the output vector for the node_type Args: node_type: The node type Returns: An integer specifying the vector length for this node type """ if node_type is None: if len(self._node_specs) == 1: node_type = next(iter(self._node_specs.keys())) else: raise ValueError( "Node type must be specified if there are multiple node types" ) elif node_type not in self._node_specs: raise ValueError( "Node type '{}' not found in attribute specification.".format(node_type) ) return np.sum([len(conv) for conv in self._node_specs[node_type].values()]) def fit_transform(self, node_type, data): """ Fit the converters for the given node type to the data and convert the data to output vectors. 
    def fit_transform(self, node_type, data):
        """
        Fit the converters for the given node type to the data and convert
        the data to output vectors.

        Args:
            node_type: The node type
            data: A list of dictionaries containing attribute names and values

        Returns:
            A numpy array containing the values of the converted attributes,
            of shape (length of data, output size)
        """
        n_data = len(data)

        # Convert attribute data to numeric values for each attribute
        converted_features = {}
        attr_list = self.get_attributes(node_type)
        for attr_name in attr_list:
            attr_data = [d.get(attr_name) for d in data]
            conv = self.get_converter(node_type, attr_name)
            converted_features[attr_name] = conv.fit_transform(attr_data)

        # Store features in array
        feature_array = np.concatenate(
            [
                np.reshape(converted_features[attr_name], (n_data, -1))
                for attr_name in attr_list
            ],
            axis=1,
        )
        return feature_array

    def transform(self, node_type, data):
        """
        Convert the supplied data to numeric vectors; this assumes that the
        converters have previously been trained.

        Args:
            node_type: The node type
            data: A list of dictionaries containing attribute names and values

        Returns:
            A numpy array containing the values of the converted attributes,
            of shape (length of data, output size)
        """
        n_data = len(data)

        # Convert attribute data to numeric values for each attribute
        converted_features = {}
        attr_list = self.get_attributes(node_type)
        for attr_name in attr_list:
            attr_data = [d.get(attr_name) for d in data]
            conv = self.get_converter(node_type, attr_name)
            converted_features[attr_name] = conv.transform(attr_data)

        # Store features in array
        feature_array = np.concatenate(
            [
                np.reshape(converted_features[attr_name], (n_data, -1))
                for attr_name in attr_list
            ],
            axis=1,
        )
        return feature_array

    def inverse_transform(self, node_type, data):
        """
        Convert the supplied numeric vectors back to the form of the original data.

        Args:
            node_type: The node type
            data: A numpy array of numeric data.

        Returns:
            A list containing the input attributes.
        """
        n_data = len(data)

        # The indices in the transformed vector for each attribute
        indices_for_attr = self.get_feature_indices(node_type)

        # Convert numeric values to the original domain for each attribute
        converted_features = {}
        attr_list = self.get_attributes(node_type)
        for attr_name in attr_list:
            conv = self.get_converter(node_type, attr_name)
            assert conv is not None
            assert attr_name in indices_for_attr

            # Extract data for this attribute
            index_range = indices_for_attr[attr_name]
            attr_data = data[:, index_range[0] : index_range[1]]
            converted_features[attr_name] = conv.inverse_transform(attr_data)

        # Convert to a list
        attr_out = [
            {attr_name: converted_features[attr_name][ii] for attr_name in attr_list}
            for ii in range(n_data)
        ]
        return attr_out


class StellarAttributeConverter(ABC):
    """
    Abstract class for attribute converters.
    """

    @abstractmethod
    def __len__(self):
        pass

    @abstractmethod
    def fit_transform(self, data):
        pass

    @abstractmethod
    def transform(self, data):
        pass

    @abstractmethod
    def inverse_transform(self, data):
        pass
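# --- Example (illustrative sketch, not part of the library source) ---
# A subclass only needs __len__ (the width of the output vector) plus the
# three transform methods above. This minimal converter passes single
# numeric values through unchanged, substituting 0.0 for missing attributes.

class IdentityConverter(StellarAttributeConverter):
    """Pass numeric attribute values through unchanged (output width 1)."""

    def __len__(self):
        return 1

    def fit_transform(self, data):
        # Nothing to fit for a pass-through converter
        return self.transform(data)

    def transform(self, data):
        return np.asarray([0.0 if d is None else float(d) for d in data])

    def inverse_transform(self, data):
        return list(np.squeeze(np.asarray(data)))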
""" def __init__(self, dtype="float32", default_value="mean", normalize="standard"): self.dtype = dtype self.normalize = normalize self.default_value = default_value def __len__(self): # TODO: extend this to multiple values return 1 def fit_transform(self, data): data = np.asarray(data, dtype=self.dtype) # Calculate normalization parameters if self.normalize == "standard": self.scale = np.nanstd(data, axis=0) self.offset = np.nanmean(data, axis=0) else: self.scale = 1 self.offset = 0 if self.scale < 1e-6: raise ValueError( "When trying to normalize the data, the standard deviation close to zero." ) return self.transform(data) def transform(self, data): data = np.asarray(data, dtype=self.dtype) # Normalization if self.normalize == "standard": data = (data - self.offset) / self.scale # Fill missing values if self.default_value == "mean": fill_value = np.nanmean(data) elif self.default_value == "median": fill_value = np.nanmedian(data) elif np.isscalar(self.default_value): fill_value = self.default_value data = np.where(np.isfinite(data), data, fill_value) return data def inverse_transform(self, data): data = np.asanyarray(data) # De-normalization if self.normalize == "standard": data = data * self.scale + self.offset # We can't un-fill missing values! return np.squeeze(data) class CategoricalConverter(StellarAttributeConverter): """ This converter takes an attribute that has multiple values (categories) and converts the categories to integers. Args: default_value: Value to assign to the vector output when the attribute is missing. dtype: (Optional) convert to a vector of this numpy data type """ def __init__(self, default_value=0, dtype="float32"): self.default_value = default_value self.dtype = dtype self.categories = [] def __len__(self): return 1 def fit_transform(self, data): self.categories = sorted(set(data), key=str) return self.transform(data) def transform(self, data): # TODO: Checks for data input return np.array( [ self.categories.index(d) if d is not None else self.default_value for d in data ], dtype=self.dtype, ) def inverse_transform(self, data): # TODO: Checks for data input return [self.categories[int(ii)] for ii in data] class OneHotCategoricalConverter(StellarAttributeConverter): """ This converter takes an attribute that has multiple values (categories) and converts the categories to one-hot vectors of length equal to the number of categories. Args: default_value: (Optional) value to assign to the vector output when the attribute is missing. without_first: (Optional) Return a vector that omits the first value, so is zero when the first category is supplied. This can be useful for inputs to DL systems. 
        dtype: (Optional) convert to a vector of this numpy data type
    """

    def __init__(self, default_value=0, without_first=False, dtype="float32"):
        self.default_value = default_value
        self.without_first = without_first
        self.dtype = dtype
        self.categories = []

    def fit_transform(self, data):
        self.categories = sorted(set(data), key=str)
        if len(self.categories) == 1:
            print("Warning: Only one category for attribute")
        return self.transform(data)

    def __len__(self):
        if self.without_first:
            size = len(self.categories) - 1
        else:
            size = len(self.categories)
        return size

    def transform(self, data):
        data_cats = [
            self.categories.index(d) if d is not None else self.default_value
            for d in data
        ]

        # Use the Keras to_categorical function to one-hot encode
        data_trans = to_categorical(data_cats, len(self.categories)).astype(self.dtype)

        # If without_first is set, remove the first value
        if self.without_first:
            data_trans = data_trans[:, 1:]

        return data_trans

    def inverse_transform(self, data):
        data = np.asanyarray(data)
        assert np.ndim(data) == 2

        # Get an integer category, adding one if we have without_first=True
        category_id = np.argmax(data, axis=1)
        if self.without_first:
            category_id = (category_id + 1) * np.any(data, axis=1).astype(int)

        return [self.categories[ii] for ii in category_id]


class BinaryConverter(StellarAttributeConverter):
    """
    This converter will create a value with a one if the attribute exists in
    the node attributes and zero if it does not.

    Args:
        default_value: Value to assign to the vector output when the attribute
            is missing.
        dtype: (Optional) convert to a vector of this numpy data type
    """

    def __init__(self, dtype="float32", default_value=0):
        self.dtype = dtype
        self.default_value = default_value

    def __len__(self):
        return 1

    def fit_transform(self, data):
        return self.transform(data)

    def transform(self, data):
        data_bool = [
            bool(d) if d is not None else bool(self.default_value) for d in data
        ]
        return np.asarray(data_bool, dtype=self.dtype)

    def inverse_transform(self, data):
        return [None if d == 0 else 1 for d in data]
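# --- Usage sketch (illustrative example, not part of the library source) ---
# Fit a specification with one one-hot categorical and one binary attribute,
# then round-trip some node data. Node type and attribute names are made up.

nfs = NodeAttributeSpecification()
nfs.add_attribute("paper", "subject", OneHotCategoricalConverter)
nfs.add_attribute("paper", "cited", BinaryConverter)

node_data = [
    {"subject": "AI", "cited": True},
    {"subject": "ML"},  # missing "cited" falls back to the default value
]
vectors = nfs.fit_transform("paper", node_data)
print(vectors.shape)                            # (2, 3): 1 binary + 2 one-hot columns
print(nfs.inverse_transform("paper", vectors))  # recovers the categories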
# File: stellargraph/data/edge_splitter.py

# -*- coding: utf-8 -*-
#
# Copyright 2017-2018 Data61, CSIRO
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

__all__ = ["EdgeSplitter"]

import datetime

import networkx as nx
import pandas as pd
import numpy as np
from math import isclose


class EdgeSplitter(object):
    """
    Class for generating training and test data for link prediction in graphs.

    The class requires as input a graph (in networkx format) and a percentage,
    as a function of the total number of edges in the given graph, of the
    number of positive and negative edges to sample. For heterogeneous graphs,
    the caller can also specify the type of edge and an edge property to split
    on. In the latter case, only a date property can be used and it must be in
    the format dd/mm/yyyy. A date to be used as a threshold value must also be
    given, such that only edges with a date after the threshold are sampled.
    This affects only the sampling of positive edges.

    Negative edges are sampled at random (for the 'global' method) by
    uniformly selecting two nodes in the graph and then checking if these
    nodes are connected or not. If not, the pair of nodes is considered a
    negative sample. Otherwise, it is discarded and the process repeats.
    Alternatively (for the 'local' method), negative edges are sampled using
    a DFS search at a distance from the source node (selected uniformly at
    random from all nodes in the graph) sampled according to a given set of
    probabilities.

    Positive edges are sampled such that the original graph remains connected.
    This is achieved by first calculating the minimum spanning tree. The edges
    in the minimum spanning tree cannot be removed, i.e., selected as positive
    training edges. The remaining edges, those not on the minimum spanning
    tree, are sampled uniformly at random, until either the required number of
    edges have been sampled as positive examples or no more edges can be
    removed.
    """

    def __init__(self, g, g_master=None):
        # the original graph copied over
        self.g = g.copy()
        self.g_master = g_master
        # placeholder: it will hold the subgraph of self.g after edges are
        # removed as positive training samples
        self.g_train = None

        self.positive_edges_ids = None
        self.positive_edges_labels = None
        self.negative_edges_ids = None
        self.negative_edges_labels = None

        self.negative_edge_node_distances = None
        # the minimum spanning tree as a list of edges.
        self.minedges = None
        # lookup dictionary for edges in minimum spanning tree
        self.minedges_set = None
        self._random = None
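    # Example (illustrative): typical use for link prediction on a homogeneous
    # graph g, sampling 10% of edges as positives (plus matched negatives):
    #
    #     >>> es = EdgeSplitter(g)
    #     >>> g_train, edge_ids, edge_labels = es.train_test_split(
    #     ...     p=0.1, method="global")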
    def _train_test_split_homogeneous(self, p, method, probs=None):
        """
        Method for edge splitting applied to homogeneous graphs.

        Args:
            p: Fraction of the total number of edges to sample as positive
                (and negative) examples.
            method: Should be 'global' or 'local'. Specifies the method for
                selecting negative examples.
            probs: If method is 'local' then this vector of floats specifies
                the probabilities for sampling at each depth from the source
                node. The first value should be 0.0 and all values should sum
                to 1.0.

        Returns:
            2 numpy arrays, the first Nx2 holding the node ids for the edges
            and the second Nx1 holding the edge labels, 0 for negative and 1
            for positive example.
        """
        # minedges are those edges that, if removed, might leave a disconnected
        # graph after the positive edges have been sampled.
        self.minedges = self._get_minimum_spanning_edges()

        # Sample the positive examples
        positive_edges = self._reduce_graph(minedges=self.minedges_set, p=p)
        df = pd.DataFrame(positive_edges)
        self.positive_edges_ids = np.array(df.iloc[:, 0:2])
        self.positive_edges_labels = np.array(df.iloc[:, 2])

        if method == "global":
            negative_edges = self._sample_negative_examples_global(
                p=p, limit_samples=len(positive_edges)
            )
        else:  # method == 'local'
            if probs is None:
                # use default values if not given, but warn user
                probs = [0.0, 0.25, 0.50, 0.25]
                print(
                    "WARNING: Using default sampling probabilities (distance from source node): {}".format(
                        probs
                    )
                )
            negative_edges = self._sample_negative_examples_local_dfs(
                p=p, probs=probs, limit_samples=len(positive_edges)
            )
        df = pd.DataFrame(negative_edges)
        self.negative_edges_ids = np.array(df.iloc[:, 0:2])
        self.negative_edges_labels = np.array(df.iloc[:, 2])

        if len(self.positive_edges_ids) == 0:
            raise Exception("Could not sample any positive edges")
        if len(self.negative_edges_ids) == 0:
            raise Exception("Could not sample any negative edges")

        edge_data_ids = np.vstack((self.positive_edges_ids, self.negative_edges_ids))
        edge_data_labels = np.hstack(
            (self.positive_edges_labels, self.negative_edges_labels)
        )
        print(
            "** Sampled {} positive and {} negative edges. **".format(
                len(self.positive_edges_ids), len(self.negative_edges_ids)
            )
        )

        return edge_data_ids, edge_data_labels
    def _train_test_split_heterogeneous(
        self,
        p,
        method,
        edge_label,
        probs=None,
        edge_attribute_label=None,
        edge_attribute_threshold=None,
    ):
        """
        Splitting edge data based on edge type, or on edge type and edge
        property. The edge property must be a date in the format dd/mm/yyyy.
        If splitting by date, then a threshold value must also be given such
        that only edges with date larger than the threshold can be in the set
        of positive examples. The edge property does not affect the sampling
        of negative examples.

        Args:
            p: Fraction of the total number of edges to sample as positive
                (and negative) examples.
            method: Should be 'global' or 'local'. Specifies the method for
                selecting negative examples.
            edge_label: The edge type to split on
            probs: If method=='local' then this vector of floats specifies
                the probabilities for sampling at each depth from the source
                node. The first value should be 0.0 and all values should sum
                to 1.0.
            edge_attribute_label: The label for the edge attribute to split on
            edge_attribute_threshold: The threshold value applied to the edge
                attribute when sampling positive examples

        Returns:
            2 numpy arrays, the first Nx2 holding the node ids for the edges
            and the second Nx1 holding the edge labels, 0 for negative and 1
            for positive example.
        """
        # minedges are those edges that, if removed, might leave a disconnected
        # graph after the positive edges have been sampled.
        self.minedges = self._get_minimum_spanning_edges()

        # Note: The caller guarantees the edge_label is not None so we don't
        # have to check here again.
        if edge_attribute_threshold is None:
            positive_edges = self._reduce_graph_by_edge_type(
                minedges=self.minedges_set, p=p, edge_label=edge_label
            )
        else:
            positive_edges = self._reduce_graph_by_edge_type_and_attribute(
                minedges=self.minedges_set,
                p=p,
                edge_label=edge_label,
                edge_attribute_label=edge_attribute_label,
                edge_attribute_threshold=edge_attribute_threshold,
            )
        if len(positive_edges) == 0:
            raise Exception(
                "ERROR: Unable to sample any positive edges of type '{}'".format(
                    edge_label
                )
            )

        df = pd.DataFrame(positive_edges)
        self.positive_edges_ids = np.array(df.iloc[:, 0:2])
        self.positive_edges_labels = np.array(df.iloc[:, 2])

        if method == "global":
            negative_edges = self._sample_negative_examples_by_edge_type_global(
                p=p,
                edges=positive_edges,
                edge_label=edge_label,
                limit_samples=len(positive_edges),
            )
        else:  # method == 'local'
            if probs is None:
                probs = [0.0, 0.25, 0.50, 0.25]
                print(
                    "WARNING: Using default sampling probabilities (distance from source node): {}".format(
                        probs
                    )
                )
            negative_edges = self._sample_negative_examples_by_edge_type_local_dfs(
                p=p,
                probs=probs,
                edges_positive=positive_edges,
                edge_label=edge_label,
                limit_samples=len(positive_edges),
            )
        df = pd.DataFrame(negative_edges)
        self.negative_edges_ids = np.array(df.iloc[:, 0:2])
        self.negative_edges_labels = np.array(df.iloc[:, 2])

        if len(self.positive_edges_ids) == 0:
            raise Exception("Could not sample any positive edges")
        if len(self.negative_edges_ids) == 0:
            raise Exception("Could not sample any negative edges")

        edge_data_ids = np.vstack((self.positive_edges_ids, self.negative_edges_ids))
        edge_data_labels = np.hstack(
            (self.positive_edges_labels, self.negative_edges_labels)
        )
        print(
            "** Sampled {} positive and {} negative edges. **".format(
                len(self.positive_edges_ids), len(self.negative_edges_ids)
            )
        )

        return edge_data_ids, edge_data_labels

    def train_test_split(
        self,
        p=0.5,
        method="global",
        probs=None,
        edge_label=None,
        edge_attribute_label=None,
        edge_attribute_threshold=None,
        attribute_is_datetime=None,
        seed=None,
    ):
        """
        Generates positive and negative edges and a graph that has the same
        nodes as the original but with the positive edges removed. It can be
        used to generate data from homogeneous and heterogeneous graphs.

        For heterogeneous graphs, positive and negative examples can be
        generated based on a specified edge type, or on an edge type and an
        edge property given a threshold value for the latter.

        Args:
            p: Percent of edges to be generated as a function of the total
                number of edges in the original graph
            method: How negative edges are sampled. If 'global', then nodes
                are selected uniformly at random. If 'local' then the first
                node is sampled uniformly from all nodes in the graph, but the
                second node is chosen to be from the former's local
                neighbourhood.
            probs: list The probabilities for sampling a node that is k-hops
                from the source node, e.g., [0.25, 0.75] means that there is a
                0.25 probability that the target node will be 1 hop away from
                the source node and 0.75 that it will be 2 hops away from the
                source node. This only affects sampling of negative edges if
                method is set to 'local'.
            edge_label: If splitting based on edge type, then this parameter
                specifies the key for the type of edges to split on.
            edge_attribute_label: The label for the edge attribute to split on.
            edge_attribute_threshold: The threshold value applied to the edge
                attribute when sampling positive examples.
            attribute_is_datetime: Specifies if edge attribute is datetime or not.
            seed: Seed for the random number generator; a non-negative integer or None.

        Returns:
            The reduced graph (positive edges removed) and the edge data as 2 numpy arrays, the first Nx2 holding
            the node ids for the edges and the second Nx1 holding the edge labels, 0 for a negative and 1 for a
            positive example.
        """
        if p <= 0 or p >= 1:
            raise ValueError("The value of p must be in the interval (0,1)")

        if method != "global" and method != "local":
            raise ValueError(
                "Invalid method {}; valid options are 'local' or 'global'".format(
                    method
                )
            )

        if seed is not None:
            # check the type first, so that the comparison below cannot fail on a non-numeric seed
            if type(seed) != int:
                raise ValueError(
                    "({}) The random number generator seed value, seed, should be integer type or None.".format(
                        type(self).__name__
                    )
                )
            if seed < 0:
                raise ValueError(
                    "({}) The random number generator seed value, seed, should be a non-negative integer or None.".format(
                        type(self).__name__
                    )
                )

        if self._random is None:  # only do this once
            self._random = np.random.RandomState(seed=seed)

        if edge_label is not None:  # working with a heterogeneous graph
            if (
                edge_attribute_label
                and edge_attribute_threshold
                and not attribute_is_datetime
            ):
                raise ValueError("You can only split by a datetime edge attribute")
            else:
                # either no edge attribute was given, or the attribute is a datetime
                edge_data_ids, edge_data_labels = self._train_test_split_heterogeneous(
                    p=p,
                    method=method,
                    edge_label=edge_label,
                    edge_attribute_label=edge_attribute_label,
                    edge_attribute_threshold=edge_attribute_threshold,
                )
        else:  # working with a homogeneous graph
            edge_data_ids, edge_data_labels = self._train_test_split_homogeneous(
                p=p, method=method, probs=probs
            )

        return self.g_train, edge_data_ids, edge_data_labels

    def _get_edges(
        self, edge_label, edge_attribute_label=None, edge_attribute_threshold=None
    ):
        """
        Filters the edges of the (heterogeneous) graph self.g based on either the edge type specified by
        edge_label, or on edges of edge_label type whose edge_attribute_label property has a value larger than
        edge_attribute_threshold.

        Args:
            edge_label: The type of edges to filter
            edge_attribute_label: The edge attribute to use for filtering graph edges
            edge_attribute_threshold: The threshold applied to the edge attribute for filtering edges.

        Returns:
            List of edges that satisfy the filtering criteria.
        """
        # the original graph in networkx format is stored in self.g
        if self.g.is_multigraph():
            all_edges = list(self.g.edges(keys=True))
        else:
            all_edges = list(self.g.edges())

        if edge_attribute_label is None or edge_attribute_threshold is None:
            # filter by edge_label only
            edges_with_label = [
                e for e in all_edges if self.g.get_edge_data(*e)["label"] == edge_label
            ]
        else:
            # both are given: filter by edge label, edge attribute, and threshold value
            edge_attribute_threshold_dt = datetime.datetime.strptime(
                edge_attribute_threshold, "%d/%m/%Y"
            )
            edges_with_label = [
                e
                for e in all_edges
                if (
                    self.g.get_edge_data(*e)["label"] == edge_label
                    and datetime.datetime.strptime(
                        self.g.get_edge_data(*e)[edge_attribute_label], "%d/%m/%Y"
                    )
                    > edge_attribute_threshold_dt
                )
            ]

        return edges_with_label

    def _get_edge_source_and_target_node_types(self, edges):
        """
        Given a list of edges, determines the type of the source and target node of each edge and returns these
        as a set of 2-tuples. This routine is necessary because networkx does not provide a direct method for
        determining the types of the nodes incident on an edge.
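        Example (illustrative, with made-up node ids): for edges
        [("u1", "p1"), ("u2", "p1")], where the "u*" nodes have label "user"
        and "p1" has label "product", the result is the set
        {("user", "product")}.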
Args: edges: List of edges as returned by networkx graph method edges() Returns: Returns a list of 2-tuples such that each value in the tuple holds the type (as str) of the source and target nodes for each element in edges. """ # uses self.g_train but any graph object would do since nodes are shared all_nodes = self.g_train.nodes(data=True) # dictionary that maps node id to node attributes all_nodes_as_dict = {n[0]: n[1] for n in all_nodes} edge_node_types = set() for edge in edges: edge_node_types.add( ( all_nodes_as_dict[edge[0]]["label"], all_nodes_as_dict[edge[1]]["label"], ) ) return edge_node_types def _reduce_graph_by_edge_type_and_attribute( self, minedges, p=0.5, edge_label=None, edge_attribute_label=None, edge_attribute_threshold=None, ): """ Reduces the graph self.g_train by a factor p by removing existing edges not on minedges list such that the reduced tree remains connected. Edges are removed based on the edge type and the values of a given edge attribute and a threshold applied to the latter. Args: minedges: Spanning tree edges that cannot be removed p: Factor by which to reduce the size of the graph edge_label: The edge type to consider edge_attribute_label: The edge attribute to consider edge_attribute_threshold: The threshold value; only edges with attribute value larger than the threshold can be removed Returns: Returns the list of edges removed from the graph (also modifies the graph self.g_train by removing the said edges) """ # We check that the parameters are given values but we don't check if the graph has edges with label # edge_label and edge attributes with label edge_attribute_label. For now, we assume that the given values # are valid; if not, then some cryptic exception is bound to be raised later on in the code. if edge_label is None: raise ValueError("edge_label must be specified.") if edge_attribute_label is None: raise ValueError("edge_attribute_label must be specified.") if edge_attribute_threshold is None: raise ValueError("attribute_threshold must be specified.") # copy the original graph and start over in case this is not the first time # reduce_graph has been called. self.g_train = self.g.copy() # Filter the graph's edges based on the edge type, edge attribute, and attribute threshold value given. all_edges = self._get_edges( edge_label=edge_label, edge_attribute_label=edge_attribute_label, edge_attribute_threshold=edge_attribute_threshold, ) # Also, calculate the number of these edges in the graph. num_edges_total = len(all_edges) # print("Graph has {} edges of type {}".format(num_edges_total, edge_label)) # Multiply this number by p to determine the number of positive edge examples to sample num_edges_to_remove = int(num_edges_total * p) # shuffle the edges self._random.shuffle(all_edges) # # iterate over the list of edges and for each edge if the edge is not in minedges, remove it from the graph # until num_edges_to_remove edges have been removed and the graph reduced to p of its original size count = 0 removed_edges = [] for edge in all_edges: # Support minedges having keys (NetworkX 2.x) or not (NetworkX 1.x) if edge not in minedges and (edge[0], edge[1]) not in minedges: removed_edges.append( ( edge[0], edge[1], 1, ) # should this be edge + (1,) to support multigraphs? 
) # the last entry is the label self.g_train.remove_edge(*edge) count += 1 if count % 1000 == 0: print("Removed", count, "edges") if count == num_edges_to_remove: return removed_edges return removed_edges def _reduce_graph_by_edge_type(self, minedges, p=0.5, edge_label=None): """ Reduces the graph self.g_train by a factor p by removing existing edges not on minedges list such that the reduced tree remains connected. Edges are removed based on the edge type. Args: minedges: Minimum spanning tree edges that cannot be removed p: Factor by which to reduce the size of the graph edge_label: The edge type to consider Returns: Returns the list of edges removed from self.g_train (also modifies self.g_train by removing said edges) """ if edge_label is None: raise ValueError("edge_label must be specified") # copy the original graph and start over in case this is not the first time # reduce_graph has been called. self.g_train = self.g.copy() # Filter the graph's edges based on the specified edge_label all_edges = self._get_edges(edge_label=edge_label) num_edges_total = len(all_edges) print("Network has {} edges of type {}".format(num_edges_total, edge_label)) # Multiply this number by p to determine the number of positive edge examples to sample num_edges_to_remove = int(num_edges_total * p) # shuffle the edges self._random.shuffle(all_edges) # iterate over the list of filtered edges and for each edge if the edge is not in minedges, remove it from # the graph until num_edges_to_remove edges have been removed and the graph is reduced to p of its original # size count = 0 removed_edges = [] for edge in all_edges: # Support minedges having keys (NetworkX 2.x) or not (NetworkX 1.x) if edge not in minedges and (edge[0], edge[1]) not in minedges: removed_edges.append( (edge[0], edge[1], 1) ) # the last entry is the label self.g_train.remove_edge(*edge) count += 1 if count % 1000 == 0: print("Removed", count, "edges") if count == num_edges_to_remove: return removed_edges return removed_edges def _reduce_graph(self, minedges, p=0.5): """ Reduces the graph self.g_train by a factor p by removing existing edges not on minedges list such that the reduced tree remains connected. Edge type is ignored and all edges are treated equally. Args: minedges: Minimum spanning tree edges that cannot be removed p: Factor by which to reduce the size of the graph Returns: Returns the list of edges removed from self.g_train (also modifies self.g_train by removing the said edges) """ # copy the original graph and start over in case this is not the first time # reduce_graph has been called. 
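        # Aside: an illustrative, standalone sketch (not executed here) of the
        # reduction strategy implemented below -- keep the spanning-tree edges
        # and delete a fraction p of the remaining ones, so that the training
        # graph stays connected:
        #
        #     import random
        #     import networkx as nx
        #
        #     g = nx.barbell_graph(5, 2)
        #     mst = set(nx.minimum_spanning_edges(g, data=False))
        #     removable = [
        #         e for e in g.edges() if e not in mst and (e[1], e[0]) not in mst
        #     ]
        #     random.shuffle(removable)
        #     for e in removable[: int(len(removable) * 0.5)]:  # p = 0.5
        #         g.remove_edge(*e)
        #     assert nx.is_connected(g)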
        self.g_train = self.g.copy()

        # For multigraphs we should probably use keys
        use_keys_in_edges = self.g.is_multigraph()

        # For NetworkX 1.x/2.x compatibility we need to match the length of the entries in minedges
        if len(minedges) > 0:
            use_keys_in_edges = len(next(iter(minedges))) == 3

        if use_keys_in_edges:
            all_edges = list(self.g_train.edges(keys=True))
        else:
            all_edges = list(self.g_train.edges())

        num_edges_to_remove = int(
            (self.g_train.number_of_edges() - len(self.minedges)) * p
        )

        # shuffle the edges
        self._random.shuffle(all_edges)
        # iterate over the list of edges and for each edge, if the edge is not in minedges, remove it from the
        # graph until num_edges_to_remove edges have been removed and the graph is reduced to p of its original
        # size
        count = 0
        removed_edges = []
        for edge in all_edges:
            if edge not in minedges:
                removed_edges.append(
                    (edge[0], edge[1], 1)
                )  # the last entry is the label
                self.g_train.remove_edge(*edge)

                count += 1
                if count % 1000 == 0:
                    print("Removed", count, "edges")
                if count == num_edges_to_remove:
                    return removed_edges

        # return whatever was removed, even if fewer than num_edges_to_remove edges could be removed
        return removed_edges

    def _sample_negative_examples_by_edge_type_local_dfs(
        self,
        p=0.5,
        probs=None,
        edges_positive=None,
        edge_label=None,
        limit_samples=None,
    ):
        """
        This method produces a list of edges that don't exist in the graph self.g (negative examples). The number
        of negative edges produced is equal to the number of edges of type edge_label in the graph times p (which
        should be in the range (0, 1]), or is capped at limit_samples if the latter is not None. The negative
        samples are between node types as inferred from the edge type of the positive examples previously removed
        from the graph and given in edges_positive.

        This method uses depth-first search to efficiently (memory-wise) sample negative edges based on the local
        neighbourhood of randomly (uniformly) sampled source nodes at distances defined by the probabilities in
        probs. The source graph is not modified.

        Args:
            p: Factor that multiplies the number of edges in the graph and determines the number of negative
                edges to be sampled.
            probs: Probability distribution for the distance between source and target nodes.
            edges_positive: The positive edge examples that have previously been removed from the graph
            edge_label: The edge type to sample negative examples of
            limit_samples: It limits the maximum number of samples to the given number, if not None

        Returns:
            A list of 2-tuples that are pairs of node IDs that don't have an edge between them in the graph.
        """
        if probs is None:
            probs = [0.0, 0.25, 0.50, 0.25]
            print(
                "Warning: Using default sampling probabilities (up to 4 hops from the source node) with values {}".format(
                    probs
                )
            )

        if not isclose(sum(probs), 1.0):
            raise ValueError("Sampling probabilities do not sum to 1")

        self.negative_edge_node_distances = []
        n = len(probs)

        # determine the number of edges in the graph that have edge_label type;
        # multiply this number by p to determine the number of negative edge examples to sample
        all_edges = self._get_edges(edge_label=edge_label)
        num_edges_total = len(all_edges)
        print("Network has {} edges of type {}".format(num_edges_total, edge_label))

        num_edges_to_sample = int(num_edges_total * p)

        if limit_samples is not None:
            if num_edges_to_sample > limit_samples:
                num_edges_to_sample = limit_samples

        edge_source_target_node_types = self._get_edge_source_and_target_node_types(
            edges=edges_positive
        )

        if self.g_master is None:
            edges = self.g.edges()
        else:
            edges = self.g_master.edges()

        # To speed up the edge-membership tests below, store the (source, target) node id pairs in a set,
        # in both directions.
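        # Aside: an illustrative, standalone sketch (values made up) of how the
        # hop distance of each candidate target node is drawn in the DFS loop
        # further below:
        #
        #     import numpy as np
        #
        #     probs = [0.0, 0.25, 0.50, 0.25]  # probs[i] = P(distance == i + 1)
        #     rng = np.random.RandomState(42)
        #     distances = rng.choice(len(probs), 10, p=probs) + 1
        #     # distances lie in {2, 3, 4}; distance 1 never occurs since probs[0] == 0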
edges_set = set(edges) edges_set.update({(e[1], e[0]) for e in edges}) sampled_edges_set = set() start_nodes = list(self.g.nodes(data=True)) nodes_dict = {node[0]: node[1]["label"] for node in start_nodes} count = 0 sampled_edges = [] num_iter = int(np.ceil(num_edges_to_sample / (1.0 * len(start_nodes)))) + 1 for _ in np.arange(0, num_iter): self._random.shuffle(start_nodes) # sample the distance to the target node using probs target_node_distances = ( self._random.choice(n, len(start_nodes), p=probs) + 1 ) for u, d in zip(start_nodes, target_node_distances): # perform DFS search up to d distance from the start node u. visited = { node[0]: False for node in start_nodes } # for marking already visited nodes nodes_stack = list() # start at node u nodes_stack.append((u[0], 0)) # tuple is (node, depth) while len(nodes_stack) > 0: next_node = nodes_stack.pop() v = next_node[0] # retrieve node id dv = next_node[1] # retrieve node distance from u if not visited[v]: visited[v] = True # Check if this nodes is at depth d; if it is, then this could be selected as the # target node for a negative edge sample. Otherwise add its neighbours to the stack, only # if the depth is less than the search depth d. if dv == d: u_v_edge_type = (nodes_dict[u[0]], nodes_dict[v]) # if no edge between u and next_node[0] then this is the sample, so record and stop # searching # Note: The if statement below is very expensive to evaluate because it need to checks # the membership of an element in a number of lists that can grow very large for large # graphs and number examples to sample. Later, we should have a closer look at how we can # speed this up. if ( (u_v_edge_type in edge_source_target_node_types) and (u[0] != v) and ((u[0], v) not in edges_set) and ((u[0], v) not in sampled_edges_set) ): sampled_edges.append( (u[0], v, 0) ) # the last entry is the class label sampled_edges_set.add((u[0], v)) sampled_edges_set.add((v, u[0])) count += 1 if count % 1000 == 0: print("Sampled {} negatives".format(count)) self.negative_edge_node_distances.append(d) break elif dv < d: neighbours = list(nx.neighbors(self.g, v)) self._random.shuffle(neighbours) neighbours = [(k, dv + 1) for k in neighbours] nodes_stack.extend(neighbours) if count == num_edges_to_sample: return sampled_edges return sampled_edges def _sample_negative_examples_local_dfs( self, p=0.5, probs=None, limit_samples=None ): """ This method produces a list of edges that don't exist in graph self.g (negative examples). The number of negative edges produced is equal to the number of edges in the graph times p (that should be in the range (0,1] or limited to maximum limit_samples if the latter is not None. This method uses depth-first search to efficiently (memory-wise) sample negative edges based on the local neighbourhood of randomly (uniformly) sampled source nodes at distances defined by the probabilities in probs. The source graph is not modified. Args: p: Factor that multiplies the number of edges in the graph and determines the number of no-edges to be sampled. probs: Probability distribution for the distance between source and target nodes. limit_samples: It limits the maximum number of samples to the given number, if not None Returns: A list of 2-tuples that are pairs of node IDs that don't have an edge between them in the graph. 
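        Example (illustrative): with probs=[0.0, 0.25, 0.50, 0.25], a sampled
        target node lies 2 hops from the source with probability 0.25, 3 hops
        with probability 0.50, and 4 hops with probability 0.25; it is never an
        immediate neighbour because probs[0] is 0.0.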
""" if probs is None: probs = [0.0, 0.25, 0.50, 0.25] print( "Warning: Using default sampling probabilities up to 3 hops from source node with values {}".format( probs ) ) if not isclose(sum(probs), 1.0): raise ValueError("Sampling probabilities do not sum to 1") self.negative_edge_node_distances = [] n = len(probs) if self.minedges is None: num_edges_to_sample = int(self.g.number_of_edges() * p) else: num_edges_to_sample = int( (self.g.number_of_edges() - len(self.minedges)) * p ) if limit_samples is not None: if num_edges_to_sample > limit_samples: num_edges_to_sample = limit_samples if self.g_master is None: edges = self.g.edges() else: edges = self.g_master.edges() # to speed up lookup of edges in edges list, create a set the values stored are the concatenation of # the source and target node ids. edges_set = set(edges) edges_set.update({(e[1], e[0]) for e in edges}) sampled_edges_set = set() start_nodes = list(self.g.nodes(data=False)) count = 0 sampled_edges = [] num_iter = int(np.ceil(num_edges_to_sample / (1.0 * len(start_nodes)))) for _ in np.arange(0, num_iter): self._random.shuffle(start_nodes) # sample the distance to the target node using probs target_node_distances = ( self._random.choice(n, len(start_nodes), p=probs) + 1 ) for u, d in zip(start_nodes, target_node_distances): # perform DFS search up to d distance from the start node u. visited = {node: False for node in start_nodes} nodes_stack = list() # start at node u nodes_stack.append((u, 0)) # tuple is node, depth while len(nodes_stack) > 0: next_node = nodes_stack.pop() v = next_node[0] dv = next_node[1] if not visited[v]: visited[v] = True # Check if this nodes is at depth d; if it is, then this could be selected as the # target node for a negative edge sample. Otherwise add its neighbours to the stack, only # if the depth is less than the search depth d. if dv == d: # if no edge between u and next_node[0] then this is the sample, so record and stop # searching if ( (u != v) and ((u, v) not in edges_set) and ((u, v) not in sampled_edges_set) ): sampled_edges.append( (u, v, 0) ) # the last entry is the class label sampled_edges_set.add((u, v)) sampled_edges_set.add((v, u)) count += 1 self.negative_edge_node_distances.append(d) if count % 1000 == 0: print("Sampled {} negatives".format(count)) break elif dv < d: neighbours = list(nx.neighbors(self.g, v)) self._random.shuffle(neighbours) neighbours = [(k, dv + 1) for k in neighbours] nodes_stack.extend(neighbours) if count == num_edges_to_sample: return sampled_edges return sampled_edges def _sample_negative_examples_local_bfs( self, p=0.5, probs=None, limit_samples=None ): """ Deprecated: Replaced by method that use DFS and are much more efficient in terms of memory usage. This method produces a list of edges that don't exist in graph self.g (negative examples). The number of negative edges produced is equal to the number of edges in the graph times p (that should be in the range (0,1] or limited to maximum limit_samples if the latter is not None. The source graph is not modified. Args: p: factor that multiplies the number of edges in the graph and determines the number of negative edges to be sampled. probs: Probability distribution for the distance between source and target nodes. limit_samples: It limits the maximum number of sample to the given number, if not None Returns: A list of 2-tuples that are pairs of node IDs that don't have an edge between them in the graph. 
""" if probs is None: probs = [0.0, 0.25, 0.50, 0.25] print("Warning: Using default sampling probabilities {}".format(probs)) self.negative_edge_node_distances = [] n = len(probs) num_edges_to_sample = int(self.g.number_of_edges() * p) if limit_samples is not None: if num_edges_to_sample > limit_samples: num_edges_to_sample = limit_samples if self.g_master is None: edges = list(self.g.edges()) else: edges = list(self.g_master.edges()) start_nodes = list(self.g.nodes(data=False)) count = 0 sampled_edges = list() num_iter = int(np.ceil(num_edges_to_sample / (1.0 * len(start_nodes)))) for _ in np.arange(0, num_iter): self._random.shuffle(start_nodes) target_node_distances = ( self._random.choice(n, len(start_nodes), p=probs) + 1 ) for u, d in zip(start_nodes, target_node_distances): # collect all the nodes that are d links away from u nodes_at_frontier = list() nodes_at_frontier.append(u) for _ in np.arange(d): next_level_nodes = list() if len(nodes_at_frontier) == 0: break [ next_level_nodes.extend(nx.neighbors(self.g, n)) for n in nodes_at_frontier ] nodes_at_frontier = next_level_nodes if len(nodes_at_frontier) == 0: break # check if u, v where v in nodes_at_frontier have an edge. The first pair that has no edge in the graph # becomes a negative sample self._random.shuffle(nodes_at_frontier) for v in nodes_at_frontier: if ( (u != v) and ((u, v) not in edges) and ((v, u) not in edges) and ((u, v, 0) not in sampled_edges) and ((v, u, 0) not in sampled_edges) ): sampled_edges.append( (u, v, 0) ) # the last entry is the class label count += 1 self.negative_edge_node_distances.append(d) break if count == num_edges_to_sample: return sampled_edges return sampled_edges def _sample_negative_examples_global(self, p=0.5, limit_samples=None): """ This method samples uniformly at random nodes from the graph and, if they don't have an edge in the graph, it records the pair as a negative edge. Args: p: factor that multiplies the number of edges in the graph and determines the number of negative edges to be sampled. limit_samples: it limits the maximum number of samples to the given number, if not None Returns: A list of 2-tuples that are pairs of node IDs that don't have an edge between them in the graph. """ self.negative_edge_node_distances = [] if self.minedges is None: num_edges_to_sample = int(self.g.number_of_edges() * p) else: num_edges_to_sample = int( (self.g.number_of_edges() - len(self.minedges)) * p ) if limit_samples is not None: if num_edges_to_sample > limit_samples: num_edges_to_sample = limit_samples if self.g_master is None: edges = list(self.g.edges()) else: edges = list(self.g_master.edges()) # to speed up lookup of edges in edges list, create a set the values stored are the concatenation of # the source and target node ids. 
edges_set = set(edges) edges_set.update({(u[1], u[0]) for u in edges}) sampled_edges_set = set() start_nodes = list(self.g.nodes(data=False)) end_nodes = list(self.g.nodes(data=False)) count = 0 sampled_edges = [] num_iter = int(np.ceil(num_edges_to_sample / (1.0 * len(start_nodes)))) + 1 for _ in np.arange(0, num_iter): self._random.shuffle(start_nodes) self._random.shuffle(end_nodes) for u, v in zip(start_nodes, end_nodes): if ( (u != v) and ((u, v) not in edges_set) and ((u, v) not in sampled_edges_set) ): sampled_edges.append((u, v, 0)) # the last entry is the class label sampled_edges_set.update( {(u, v), (v, u)} ) # test for bi-directional edges count += 1 if count == num_edges_to_sample: return sampled_edges if count % 1000 == 0: print("Sampled {} negative examples".format(count)) return sampled_edges def _sample_negative_examples_by_edge_type_global( self, edges, edge_label, p=0.5, limit_samples=None ): """ This method produces a list of edges that don't exist in graph self.g (negative examples). The number of negative edges produced is equal to the number of edges with label edge_label in the graph times p (that should be in the range (0,1] or limited to maximum limit_samples if the latter is not None. The negative samples are between node types as inferred from the edge type of the positive examples previously removed from the graph and given in edges_positive. The source graph is not modified. Args: edges: The positive edge examples that have previously been removed from the graph edge_label: The edge type to sample negative examples of p: Factor that multiplies the number of edges in the graph and determines the number of negative edges to be sampled. limit_samples: It limits the maximum number of samples to the given number, if not None Returns: A list of 2-tuples that are pairs of node IDs that don't have an edge between them in the graph. """ self.negative_edge_node_distances = [] # determine the number of edges in the graph that have edge_label type # Multiply this number by p to determine the number of positive edge examples to sample all_edges = self._get_edges(edge_label=edge_label) num_edges_total = len(all_edges) print("Network has {} edges of type {}".format(num_edges_total, edge_label)) # num_edges_to_sample = int(num_edges_total * p) if limit_samples is not None: if num_edges_to_sample > limit_samples: num_edges_to_sample = limit_samples edge_source_target_node_types = self._get_edge_source_and_target_node_types( edges=edges ) # to speed up lookup of edges in edges list, create a set the values stored are the concatenation of # the source and target node ids. 
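        # Aside: an illustrative, standalone sketch (not executed here) of the
        # uniform pair-sampling idea used below; storing both orientations of
        # every existing edge makes the undirected membership test a single
        # O(1) set lookup:
        #
        #     import random
        #     import networkx as nx
        #
        #     g = nx.karate_club_graph()
        #     existing = set(g.edges()) | {(v, u) for u, v in g.edges()}
        #     nodes = list(g.nodes())
        #     negatives = set()
        #     while len(negatives) < 10:
        #         u, v = random.sample(nodes, 2)
        #         if (u, v) not in existing and (v, u) not in negatives:
        #             negatives.add((u, v))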
edges_set = set(edges) edges_set.update({(u[1], u[0]) for u in edges}) sampled_edges_set = set() start_nodes = list(self.g.nodes(data=True)) end_nodes = list(self.g.nodes(data=True)) count = 0 sampled_edges = [] num_iter = int(np.ceil(num_edges_to_sample / (1.0 * len(start_nodes)))) + 1 for _ in np.arange(0, num_iter): self._random.shuffle(start_nodes) self._random.shuffle(end_nodes) for u, v in zip(start_nodes, end_nodes): u_v_edge_type = (u[1]["label"], v[1]["label"]) if ( (u_v_edge_type in edge_source_target_node_types) and (u != v) and ((u[0], v[0]) not in edges_set) and ((u[0], v[0]) not in sampled_edges_set) ): sampled_edges.append( (u[0], v[0], 0) ) # the last entry is the class label sampled_edges_set.update({(u[0], v[0]), (v[0], u[0])}) count += 1 if count % 1000 == 0: print("Sampled", count, "negative edges") if count == num_edges_to_sample: return sampled_edges return sampled_edges def _get_minimum_spanning_edges(self): """ Given an undirected graph, it calculates the minimum set of edges such that graph connectivity is preserved. Returns: The minimum spanning edges of the undirected graph self.g """ mst = nx.minimum_spanning_edges(self.g, data=False) edges = list(mst) # to speed up lookup of edges in edges list, create a set the values stored are the concatenation of # the source and target node ids. self.minedges_set = {(u[0], u[1]) for u in edges} self.minedges_set.update({(u[1], u[0]) for u in edges}) return edges PK!IWIWstellargraph/data/epgm.py# -*- coding: utf-8 -*- # # Copyright 2017-2018 Data61, CSIRO # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
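# An EPGM graph on disk is a directory containing three JSON-lines files --
# graphs.json, vertices.json, and edges.json -- with one JSON object per line
# (see EPGM.load() and EPGM.save() below). An illustrative vertices.json line
# (field values made up) looks like:
#
#     {"id": "42", "data": {"w": "1"}, "meta": {"label": "paper", "graphs": ["g1"]}}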
from collections import OrderedDict import networkx as nx from networkx.readwrite import json_graph import os import json import uuid import numpy as np import chardet import scipy.sparse as sp import multiprocessing from multiprocessing import Pool from functools import partial # from progressbar import ProgressBar, SimpleProgress from time import sleep def node_neighbours(v, edges): """Returns a list of neighbours of vertex v""" return (v, [e[1] for e in edges if e[0] == v]) def node_neighbours_extended(v, nodes, edges): """Returns a list of neighbours of vertex v""" nodes = np.array(nodes) edges = np.array(edges) mask = np.in1d(edges[:, 0], v) return (v, np.where(np.in1d(nodes, edges[mask, 1])), edges[mask, 1]) class EPGM(object): """EPGM class with converter methods to edgelist, adjacency matrix, etc.""" def _progress(self, item_name, n, step, arg, i): """Display progress of a loop, e.g., list or dict comprehension statement""" if (i + 1) % step == 0: print( "{}> {} {} out of {} processed ({}%)".format( "-" * int((i / n) * 80), item_name, i + 1, n, round((i + 1) / n * 100, 1), ) ) return arg def _progressbarUpdate(self, pbar, step, arg, i): """update progress bar""" if (i + 1) % step == 0: pbar.update(i + 1) return arg @classmethod def _nx_to_json(self, G): # Convert G to json: G_json = json_graph.node_link_data(G) G_json["graph"].update({"id": G.id}) # Convert G_json['nodes'] to strings: for n in G_json["nodes"]: n["id"] = str(n["id"]) # Fix source and target of G_json['links'] to actual node ids: for l in G_json["links"]: l["source"] = str(G_json["nodes"][l["source"]]["id"]) l["target"] = str(G_json["nodes"][l["target"]]["id"]) return G_json @classmethod def _json_to_epgm(self, G_json, node_attributes, node_labels): """Create G_epmg from G_json""" G_epgm = { "graphs": [OrderedDict()], "vertices": [OrderedDict(n) for n in G_json["nodes"]], "edges": [OrderedDict(e) for e in G_json["links"]], } # update G_epgm['graphs']: G_epgm["graphs"][0].update( OrderedDict( [ ("id", G_json["graph"]["id"]), ("data", {}), ("meta", {"label": G_json["graph"].get("name", "unnamed")}), ] ) ) # update G_epgm['vertices']: if node_labels is None: node_labels = [""] * len(G_epgm["vertices"]) for ind, v in enumerate(G_epgm["vertices"]): if node_attributes is not None: data = { str(k): str(v) for k, v in node_attributes.iloc[ind].items() if v != 0 } # extract non-zero node attributes (sparse node attributes) else: data = {} v.update( OrderedDict( [ ("data", data), ( "meta", { "label": str(node_labels[ind]), "graphs": [G_json["graph"]["id"]], }, ), ] ) ) # update G_epgm['edges']: edges_key_order = ["id", "source", "target", "data", "meta"] for e in G_epgm["edges"]: e.update( OrderedDict( [ ("id", uuid.uuid4().hex), ("data", {}), ("meta", {"label": "", "graphs": [G_json["graph"]["id"]]}), ] ) ) # reorder keys in e to edges_key_order: G_epgm["edges"] = [ OrderedDict((k, e[k]) for k in edges_key_order) for e in G_epgm["edges"] ] return G_epgm @classmethod def _reorder_keys(self, G): """Apply correct order of keys in self.G""" # Graphs: graphs_key_order = ["id", "data", "meta"] # reorder keys in g to graphs_key_order: G["graphs"] = [ OrderedDict((k, g[k]) for k in graphs_key_order) for g in G["graphs"] ] # Vertices: vertices_key_order = ["id", "data", "meta"] # reorder keys in v to vertices_key_order: G["vertices"] = [ OrderedDict((k, v[k]) for k in vertices_key_order) for v in G["vertices"] ] # Edges: edges_key_order = ["id", "source", "target", "data", "meta"] # reorder keys in e to edges_key_order: G["edges"] = [ 
OrderedDict((k, e[k]) for k in edges_key_order) for e in G["edges"] ] return G @classmethod def load(self, path): """Load graphs from EPGM path""" if not os.path.isdir(path): raise Exception("Path {} does not exist!".format(path)) G_epgm = {"graphs": list(), "vertices": list(), "edges": list()} for k in G_epgm.keys(): # detect the codec: fname = os.path.join(path, str(k) + ".json") # with open(fname, 'rb') as fb: # open the file for reading in binary format # line = fb.readline() # read the 1st line and evalulate its length # fb.seek(0) # return to the start of the file # enc = chardet.detect(fb.read(1000*len(line)))['encoding'] # detect the encoding from up to 1000 first lines in the file enc = "utf-8" # just use 'utf-8' for all files with open(fname, "r", encoding=enc) as fp: print("...reading {} using {} encoding...".format(fp.name, enc)) lines = fp.readlines() if isinstance(G_epgm[k], list): for l in lines: G_epgm[k].append(json.loads(l)) else: raise Exception( "type(G[", k, "]): unexpected type", type(G_epgm[k]), ", stopping." ) G_epgm = self._reorder_keys(G=G_epgm) return G_epgm def node_types(self, graph_id): """List node labels(types) in graph with graph_id""" if not any([graph_id in g["id"] for g in self.G["graphs"]]): raise Exception("Graph with id {} does not exist".format(graph_id)) node_types = [ v["meta"]["label"] for v in self.G["vertices"] if graph_id in v["meta"]["graphs"] ] return np.unique(node_types) def node_attributes(self, graph_id, node_type): """Return a list of node attribute names, for nodes of node_type type, that belong to graph with graph_id""" x_ind = [] nodes = [ v for v in self.G["vertices"] if (graph_id in v["meta"]["graphs"]) and (node_type in v["meta"]["label"]) ] for v in nodes: for k in list(v["data"].keys()): x_ind.append(k) return np.unique(x_ind) def node_attr_dim(self, graph_id, node_type): """Return the dimensionality of node attributes, for nodes of node_type type, that belong to graph with graph_id""" return len(np.unique(self.node_attributes(graph_id, node_type))) def __init__(self, G, node_attributes=None, node_labels=None): if "networkx.classes.graph.Graph" in str(G.__class__): G_json = self._nx_to_json(G) self.G = self._json_to_epgm(G_json, node_attributes, node_labels) elif "str" in str(G.__class__): # this assumes that G is the path to EPGM graph self.G = self.load(G) else: raise Exception("G has unknown class ", str(G.__class__)) self.G_nx = {} # placeholder for storing graphs in networkx format (populated by calling the .to_nx() method) def append(self, G_add): """Update self.G by adding a new graph G_json_add""" G_json_add = self._nx_to_json(G_add) if any([G_json_add["graph"]["id"] in g["id"] for g in self.G["graphs"]]): raise Exception( "Graph with id {} already exists".format(G_json_add["graph"]["id"]) ) # update self.G['graphs']: G_epgm_add = self._json_to_epgm(G_json_add, None, None) # was None, None, None self.G["graphs"].append(G_epgm_add["graphs"][0]) # update self.G['vertices']: G_vertices = set([v["id"] for v in self.G["vertices"]]) G_add_vertices = set([v["id"] for v in G_epgm_add["vertices"]]) common_vertices = list(G_vertices.intersection(G_add_vertices)) new_vertices = list(G_add_vertices.difference(G_vertices)) # update common vertices in self.G and G_epgm_add: for v_id in common_vertices: i = [v["id"] == v_id for v in self.G["vertices"]].index( True ) # find the vertex index in self.G self.G["vertices"][i]["meta"]["graphs"].append( G_epgm_add["graphs"][0]["id"] ) # append new vertices to self.G['vertices']: for v_id in 
new_vertices: i = [v["id"] == v_id for v in G_epgm_add["vertices"]].index( True ) # find the index of v_id in G_epgm_add['vertices'] self.G["vertices"].append( G_epgm_add["vertices"][i] ) # add the vertex v_id to self.G['vertices'] # update self.G['edges']: G_edges = set([(e["source"], e["target"]) for e in self.G["edges"]]) G_add_edges = set([(e["source"], e["target"]) for e in G_epgm_add["edges"]]) common_edges = list(G_edges.intersection(G_add_edges)) new_edges = list(G_add_edges.difference(G_edges)) # update common edges in self.G and G_epgm_add: for e_id in common_edges: i = [(e["source"], e["target"]) == e_id for e in self.G["edges"]].index( True ) # find the edge index in self.G self.G["edges"][i]["meta"]["graphs"].append(G_epgm_add["graphs"][0]["id"]) # append new edges to self.G['edges']: for e_id in new_edges: i = [(e["source"], e["target"]) == e_id for e in G_epgm_add["edges"]].index( True ) # find the index of e_id in G_epgm_add['edges'] self.G["edges"].append( G_epgm_add["edges"][i] ) # add the vertex e_id to self.G['edges'] def to_nx_OLD( self, graph_id, directed=False, parallel_processing=True, n_jobs=multiprocessing.cpu_count(), progress=True, chunksize=100, ): """Convert the graph specified by its graph_id to networkx graph""" if ( graph_id in self.G_nx.keys() ): # if self.G_nx[graph_id] already exists, just return it, otherwise evaluate it return self.G_nx[graph_id] else: print("Converting the EPGM graph {} to NetworkX graph...".format(graph_id)) if not any([graph_id in g["id"] for g in self.G["graphs"]]): raise Exception("Graph with id {} does not exist".format(graph_id)) # List relevant nodes and edges: print("...extracting relevant nodes...", end="") nodes = [ v["id"] for v in self.G["vertices"] if graph_id in v["meta"]["graphs"] ] print(" ...{} nodes extracted...".format(len(nodes))) print("...extracting relevant edges...", end="") edges = [ (e["source"], e["target"]) for e in self.G["edges"] if graph_id in e["meta"]["graphs"] ] print(" ...{} edges extracted...".format(len(edges))) # TODO: implement the case of weighted edges # create a graph as dict of lists in the format (node_id: [neighbour nodes]) print("...building the graph as dict of lists...") print( "...[parallel_processing: {}, n_jobs: {}, progress_bar: {}]".format( parallel_processing, n_jobs, progress ) ) if parallel_processing: # parallel execution pool = Pool(processes=n_jobs) if progress: n = len(nodes) self.G_nx[graph_id] = [] # pbar = ProgressBar( # widgets=[ # SimpleProgress( # format="%(value_s)s of %(max_value_s)s nodes processed (%(percentage)3d%%)" # ) # ], # maxval=n, # ).start() # _ = [pool.apply_async(partial(node_neighbours, edges=edges), args=(v,), # callback=self.G_nx[graph_id].append) for v in nodes] # it seems that appending results using callback works much slower than either pool.map_async or pool.map # while len(self.G_nx[graph_id]) != n: # pbar.update(len(self.G_nx[graph_id])) # sleep(1) graph = pool.imap( partial(node_neighbours, edges=edges), nodes, chunksize ) # lazy map # evaluate batches of imap, as the progress bar is being updated: while len(self.G_nx[graph_id]) != n: self.G_nx[graph_id].append(next(graph)) # pbar.update(len(self.G_nx[graph_id])) # pbar.finish() self.G_nx[graph_id] = dict(self.G_nx[graph_id]) else: self.G_nx[graph_id] = dict( pool.map(partial(node_neighbours, edges=edges), nodes) ) pool.close() pool.join() else: # sequential execution self.G_nx[graph_id] = { v: [e[1] for e in edges if e[0] == v] for v in nodes } # this works ~2.5x faster (for cora dataset) than 
the above for loop print("...converting the graph to nx format...") self.G_nx[graph_id] = nx.from_dict_of_lists(self.G_nx[graph_id]) if directed: self.G_nx[graph_id] = self.G_nx[graph_id].to_directed() else: self.G_nx[graph_id] = self.G_nx[graph_id].to_undirected() return self.G_nx[graph_id] def to_nx(self, graph_id, directed=False, *args): """Convert the graph specified by its graph_id to networkx Directed Multi-graph""" if ( False ): # graph_id in self.G_nx.keys(): # if self.G_nx[graph_id] already exists, just return it, otherwise evaluate it return self.G_nx[graph_id] else: # we always re-calculate self.G_nx, since directed argument can change print("Converting the EPGM graph {} to NetworkX graph...".format(graph_id)) if not any([graph_id in g["id"] for g in self.G["graphs"]]): raise Exception("Graph with id {} does not exist".format(graph_id)) self.G_nx[ graph_id ] = ( nx.MultiDiGraph() ) # create an empty directed graph that can store multiedges # add nodes to self.G_nx[graph_id], together with their attributes stored in 'data': self.G_nx[graph_id].add_nodes_from( [ (v["id"], {**v["data"], **{"label": v["meta"].get("label", "")}}) for v in self.G["vertices"] ] ) # add edges to self.G_nx[graph_id], together with their attributes stored in 'data': # I have added the edge label in the edge data; sets the label to '' if the edges don't have a label self.G_nx[graph_id].add_edges_from( [ ( e["source"], e["target"], e["id"], {**e["data"], **{"label": e["meta"].get("label", "")}}, ) for e in self.G["edges"] ] ) if not directed: self.G_nx[graph_id] = self.G_nx[graph_id].to_undirected() return self.G_nx[graph_id] def adjacency(self, graph_id, directed=False): """Return adjacency matrix of a graph specified by its graph_id""" if not any([graph_id in g["id"] for g in self.G["graphs"]]): raise Exception("Graph with id {} does not exist".format(graph_id)) print("...building the adjacency matrix...") nodes = [v["id"] for v in self.G["vertices"] if graph_id in v["meta"]["graphs"]] adj = nx.adjacency_matrix( self.to_nx(graph_id, directed), nodelist=nodes ) # ensure the nodes in adj are ordered the same as in the epgm graph return adj # def adjacency_sans_nx(self, graph_id, directed=False): # """Compose the adjacency matrix of a graph specified by its graph_id, NOT using networkx""" # if not any([graph_id in g['id'] for g in self.G['graphs']]): # raise Exception("Graph with id {} does not exist".format(graph_id)) # # # List relevant nodes and edges: # nodes = [v['id'] for v in self.G['vertices'] if graph_id in v['meta']['graphs']] # edges = [(e['source'], e['target']) for e in self.G['edges'] if graph_id in e['meta']['graphs']] # # n_nodes = len(nodes) # adj = sp.lil_matrix((n_nodes, n_nodes), dtype=int) # TODO: implement the case of weighted edges # for i, v in enumerate(nodes): # data = data = partial(node_neighbours_extended, nodes=nodes, edges=edges)(v) # node_neighbours_extended(v, nodes, edges) # nbr_idx = data[1][0] # adj.rows[i] = list(nbr_idx) # adj.data[i] = [1] * len(nbr_idx) # # TODO: implement the case of weighted edges # # if not directed: # symmetrize the adj matrix # adj = adj + adj.T - sp.diags(adj.diagonal(), dtype=int) # # return adj # def adjacency_from_edgelist(self, graph_id, directed=False): # """Return adjacency matrix of a graph specified by its graph_id""" # # FIXME: the order of nodes in the created graph, and hence in adj, is different from that in the EPGM graph. Do not use this unless it's fixed! 
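    # Aside: nx.adjacency_matrix orders rows/columns by the nodelist argument,
    # which is why adjacency() above passes the EPGM vertex order explicitly.
    # An illustrative, standalone sketch (made-up graph):
    #
    #     import networkx as nx
    #
    #     g = nx.Graph([("n1", "n2"), ("n2", "n3")])
    #     adj = nx.adjacency_matrix(g, nodelist=["n3", "n2", "n1"])
    #     # adj.todense() rows/columns now follow the order ["n3", "n2", "n1"]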
# if not any([graph_id in g['id'] for g in self.G['graphs']]): # raise Exception("Graph with id {} does not exist".format(graph_id)) # # # An alternative way, works faster, BUT GIVES A DIFFERENT NODES ORDER!!! # # List relevant edges: # edges = [(int(e['source']), int(e['target'])) for e in self.G['edges'] if graph_id in e['meta']['graphs']] # # TODO: implement the case of weighted edges # graph = nx.from_edgelist(edges) # if directed: # graph = graph.to_directed() # else: # graph = graph.to_undirected() # # adj = nx.adjacency_matrix(graph) # # return adj def edgelist(self, graph_id, directed=False): """Return edgelist of a graph specified by its graph_id""" print("...extracting the edgelist...") # edgelist = nx.to_edgelist(self.to_nx(graph_id, directed)) edgelist = [ (e["source"], e["target"]) for e in self.G["edges"] if graph_id in e["meta"]["graphs"] ] # works much faster, gets the edgelist directly from the edges.json part of epgm graph print(" ...{} edges extracted...".format(len(edgelist))) return edgelist def save(self, path): """ Write self.G into three json files: graphs.json, vertices.json, and edges.json, in path directory """ self.path = path if not os.path.isdir(self.path): os.makedirs(self.path) for k in self.G.keys(): with open(os.path.join(self.path, str(k) + ".json"), "w") as fp: if isinstance(self.G[k], list): for l in self.G[k]: json.dump(l, fp) fp.write("\n") elif isinstance(self.G[k], OrderedDict) or isinstance(self.G[k], dict): json.dump(self.G[k], fp) else: print( "type(G[", k, "]): unexpected type", type(self.G[k]), ", stopping.", ) raise () def save_as_graphml(self, graph_id, fname, directed): """ Save the graph in GraphML XML format (e.g., for visualisation in gephi) Args: graph_id: unique id of the graph fname: file name to save the graphml into Returns: Exit code of nx.write_graphml() """ if not any([graph_id in g["id"] for g in self.G["graphs"]]): raise Exception("Graph with id {} does not exist".format(graph_id)) return nx.write_graphml(nx.DiGraph(self.to_nx(graph_id, directed)), path=fname) PK!ywwstellargraph/data/explorer.py# -*- coding: utf-8 -*- # # Copyright 2017-2018 Data61, CSIRO # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. __all__ = [ "UniformRandomWalk", "BiasedRandomWalk", "UniformRandomMetaPathWalk", "DepthFirstWalk", "BreadthFirstWalk", "SampledBreadthFirstWalk", "SampledHeterogeneousBreadthFirstWalk", ] import networkx as nx import numpy as np import random from collections import defaultdict from ..core.schema import GraphSchema from ..core.graph import StellarGraphBase from ..core.utils import is_real_iterable class GraphWalk(object): """ Base class for exploring graphs. """ def __init__(self, graph, seed=None, graph_schema=None): # Initialize the random state rs = random.getstate() random.seed(seed) self._random_state = random.getstate() random.setstate(rs) self.graph = graph # We require a StellarGraph for this if not isinstance(graph, StellarGraphBase): raise TypeError( "Graph must be a StellarGraph or StellarDiGraph to use heterogeneous sampling." 
) if not graph_schema: self.graph_schema = self.graph.create_graph_schema(create_type_maps=True) else: self.graph_schema = graph_schema if self.graph_schema is not None and type(self.graph_schema) is not GraphSchema: raise ValueError( "({}) The parameter graph_schema should be either None or of type GraphSchema.".format( type(self).__name__ ) ) # Create a dict of adjacency lists per edge type, for faster neighbour sampling from graph in SampledHeteroBFS: # TODO: this could be better placed inside StellarGraph class edge_types = self.graph_schema.edge_types self.adj = dict() for et in edge_types: self.adj.update({et: defaultdict(lambda: [None])}) for n1, nbrdict in graph.adjacency(): for et in edge_types: neigh_et = [ n2 for n2, nkeys in nbrdict.items() for k in iter(nkeys) if self.graph_schema.is_of_edge_type((n1, n2, k), et) ] self.adj[et][n1] = neigh_et def neighbors(self, graph, node): if node not in graph: print("node {} not in graph".format(node)) print("Graph nodes {}".format(graph.nodes())) return list(nx.neighbors(graph, node)) def run(self, **kwargs): """ To be overridden by subclasses. It is the main entry point for performing random walks on the given graph. It should return the sequences of nodes in each random walk. Args: **kwargs: Returns: """ class UniformRandomWalk(GraphWalk): """ Performs uniform random walks on the given graph """ def run(self, nodes=None, n=None, length=None, seed=None): """ Perform a random walk starting from the root nodes. Args: nodes: The root nodes as a list of node IDs n: Total number of random walks per root node length: Maximum length of each random walk seed: Random number generator seed; default is None Returns: List of lists of nodes ids for each of the random walks """ self._check_parameter_values(nodes=nodes, n=n, length=length, seed=seed) rs = random.getstate() if seed: # seed the random number generator random.seed(seed) else: # Restore the random state random.setstate(self._random_state) walks = [] for node in nodes: # iterate over root nodes for walk_number in range(n): # generate n walks per root node walk = list() current_node = node for _ in range(length): walk.extend([current_node]) neighbours = self.neighbors(self.graph, current_node) if ( len(neighbours) == 0 ): # for whatever reason this node has no neighbours so stop break else: random.shuffle(neighbours) # shuffles the list in place current_node = neighbours[0] # select the first node to follow walks.append(walk) # Store current random state and restore original random state self._random_state = random.getstate() random.setstate(rs) return walks def _check_parameter_values(self, nodes, n, length, seed): """ Checks that the parameter values are valid or raises ValueError exceptions with a message indicating the parameter (the first one encountered in the checks) with invalid value. Args: nodes: A list of root node ids such that from each node a uniform random walk of up to length l will be generated. n: Number of walks per node id. length: Maximum length of walk measured as the number of edges followed from root node. seed: Random number generator seed """ if nodes is None: raise ValueError( "({}) A list of root node IDs was not provided.".format( type(self).__name__ ) ) if not is_real_iterable(nodes): raise ValueError("nodes parameter should be an iterable of node IDs.") if ( len(nodes) == 0 ): # this is not an error but maybe a warning should be printed to inform the caller print( "WARNING: ({}) No root node IDs given. 
An empty list will be returned as a result.".format( type(self).__name__ ) ) if type(n) != int: raise ValueError( "({}) The number of walks per root node, n, should be integer type.".format( type(self).__name__ ) ) if n <= 0: raise ValueError( "({}) The number of walks per root node, n, should be a positive integer.".format( type(self).__name__ ) ) if type(length) != int: raise ValueError( "({}) The walk length, length, should be integer type.".format( type(self).__name__ ) ) if length <= 0: raise ValueError( "({}) The walk length, length, should be positive integer.".format( type(self).__name__ ) ) if seed is not None: if type(seed) != int: raise ValueError( "({}) The random number generator seed value, seed, should be integer type or None.".format( type(self).__name__ ) ) if seed < 0: raise ValueError( "({}) The random number generator seed value, seed, should be positive integer or None.".format( type(self).__name__ ) ) class BiasedRandomWalk(GraphWalk): """ Performs biased second order random walks (like those used in Node2Vec algorithm https://snap.stanford.edu/node2vec/) controlled by the values of two parameters p and q. """ def run(self, nodes=None, n=None, p=1., q=1., length=None, seed=None): """ Perform a random walk starting from the root nodes. Args: nodes: The root nodes as a list of node IDs n: Total number of random walks per root node p: Defines probability, 1/p, of returning to source node q: Defines probability, 1/q, for moving to a node away from the source node length: Maximum length of each random walk seed: Random number generator seed; default is None Returns: List of lists of nodes ids for each of the random walks """ self._check_parameter_values( nodes=nodes, n=n, p=p, q=q, length=length, seed=seed ) rs = random.getstate() if seed: # seed the random number generator random.seed(seed) else: # Restore the random state random.setstate(self._random_state) ip = 1. / p iq = 1. / q walks = [] for node in nodes: # iterate over root nodes for walk_number in range(n): # generate n walks per root node walk = list() current_node = node # add the current node to the walk walk.extend([current_node]) # the neighbours of the current node # for isolated nodes the length of neighbours will be 0; we will test for this later neighbours = self.neighbors(self.graph, current_node) previous_node = current_node previous_node_neighbours = neighbours if len(neighbours) > 0: # special check for isolated root nodes # this is the root node so there is no previous node. The next node # is sampled with equal probability from all the neighbours current_node = neighbours[np.random.choice(a=len(neighbours))] for _ in range(length - 1): walk.extend([current_node]) # the neighbours of the current node neighbours = self.neighbors(self.graph, current_node) if ( len(neighbours) == 0 ): # for whatever reason this node has no neighbours so stop break else: # determine the sampling probabilities for all the nodes common_neighbours = set(neighbours).intersection( previous_node_neighbours ) # nodes that are in common between the previous and current nodes; these get # 1. transition probabilities probs = [iq] * len(neighbours) for i, nn in enumerate(neighbours): if nn == previous_node: # this is the previous node probs[i] = ip elif nn in common_neighbours: probs[i] = 1. 
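                        # Aside: an illustrative example of the unnormalised
                        # weights assigned above, for p=2, q=0.5 (so ip=0.5,
                        # iq=2.0): the previous node gets 0.5, a neighbour
                        # shared with the previous node gets 1.0, and any other
                        # neighbour gets 2.0. After the normalisation below,
                        # one neighbour of each kind would give
                        # [0.5, 1.0, 2.0] -> [1/7, 2/7, 4/7].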
# normalize the probabilities total_prob = sum(probs) probs = [m / total_prob for m in probs] previous_node = current_node # select the next node based on the calculated transition probabilities current_node = neighbours[ np.random.choice(a=len(neighbours), p=probs) ] walks.append(walk) # Store current random state and restore original random state self._random_state = random.getstate() random.setstate(rs) return walks def _check_parameter_values(self, nodes, n, p, q, length, seed): """ Checks that the parameter values are valid or raises ValueError exceptions with a message indicating the parameter (the first one encountered in the checks) with invalid value. Args: nodes: A list of root node ids such that from each node a uniform random walk of up to length l will be generated. n: Number of walks per node id. p: q: length: Maximum length of walk measured as the number of edges followed from root node. seed: Random number generator seed """ if nodes is None: raise ValueError( "({}) A list of root node IDs was not provided.".format( type(self).__name__ ) ) if not is_real_iterable(nodes): raise ValueError("nodes parameter should be an iterableof node IDs.") if ( len(nodes) == 0 ): # this is not an error but maybe a warning should be printed to inform the caller print( "WARNING: ({}) No root node IDs given. An empty list will be returned as a result.".format( type(self).__name__ ) ) if type(n) != int: raise ValueError( "({}) The number of walks per root node, n, should be integer type.".format( type(self).__name__ ) ) if n <= 0: raise ValueError( "({}) The number of walks per root node, n, should be a positive integer.".format( type(self).__name__ ) ) if p <= 0.: raise ValueError( "({}) Parameter p should be greater than 0.".format(type(self).__name__) ) if q <= 0.: raise ValueError( "({}) Parameter q should be greater than 0.".format(type(self).__name__) ) if type(length) != int: raise ValueError( "({}) The walk length, length, should be integer type.".format( type(self).__name__ ) ) if length <= 0: raise ValueError( "({}) The walk length, length, should be positive integer.".format( type(self).__name__ ) ) if seed is not None: if seed < 0: raise ValueError( "({}) The random number generator seed value, seed, should be positive integer or None.".format( type(self).__name__ ) ) if type(seed) != int: raise ValueError( "({}) The random number generator seed value, seed, should be integer type or None.".format( type(self).__name__ ) ) class UniformRandomMetaPathWalk(GraphWalk): """ For heterogeneous graphs, it performs uniform random walks based on given metapaths. """ def run( self, nodes=None, n=None, length=None, metapaths=None, node_type_attribute="label", seed=None, ): """ Performs metapath-driven uniform random walks on heterogeneous graphs. Args: nodes: The root nodes as a list of node IDs n: Total number of random walks per root node length: Maximum length of each random walk metapaths: List of lists of node labels that specify a metapath schema, e.g., [['Author', 'Paper', 'Author'], ['Author, 'Paper', 'Venue', 'Paper', 'Author']] specifies two metapath schemas of length 3 and 5 respectively. 
node_type_attribute: The node attribute name that stores the node's type seed: Random number generator seed; default is None Returns: List of lists of nodes ids for each of the random walks generated """ self._check_parameter_values( nodes=nodes, n=n, length=length, metapaths=metapaths, node_type_attribute=node_type_attribute, seed=seed, ) rs = random.getstate() if seed: # seed the random number generator random.seed(seed) else: # Restore the random state random.setstate(self._random_state) walks = [] for node in nodes: # retrieve node type label = self.graph.node[node][node_type_attribute] filtered_metapaths = [ metapath for metapath in metapaths if len(metapath) > 0 and metapath[0] == label ] for metapath in filtered_metapaths: # augment metapath to be length long # if ( # len(metapath) == 1 # ): # special case for random walks like in a homogeneous graphs # metapath = metapath * length # else: metapath = metapath[1:] * ((length // (len(metapath) - 1)) + 1) for _ in range(n): walk = [] # holds the walk data for this walk; first node is the starting node current_node = node for d in range(length): walk.append(current_node) # d+1 can also be used to index metapath to retrieve the node type for the next step in the walk neighbours = nx.neighbors(self.graph, node) # filter these by node type neighbours = [ node for node in neighbours if self.graph.node[node][node_type_attribute] == metapath[d] ] if len(neighbours) == 0: # if no neighbours of the required type as dictated by the metapath exist, then stop. break # select one of the neighbours uniformly at random current_node = random.choice( neighbours ) # the next node in the walk walks.append(walk) # store the walk # Store current random state and restore original random state self._random_state = random.getstate() random.setstate(rs) return walks def _check_parameter_values( self, nodes, n, length, metapaths, node_type_attribute, seed ): """ Checks that the parameter values are valid or raises ValueError exceptions with a message indicating the parameter (the first one encountered in the checks) with invalid value. Args: nodes: The starting nodes as a list of node IDs. n: Number of walks per node id. length: Maximum length of of each random walk metapaths: List of lists of node labels that specify a metapath schema, e.g., [['Author', 'Paper', 'Author'], ['Author, 'Paper', 'Venue', 'Paper', 'Author']] specifies two metapath schemas of length 3 and 5 respectively. node_type_attribute: The node attribute name that stores the node's type seed: Random number generator seed """ if nodes is None: raise ValueError( "({}) A list of starting node IDs was not provided (parameter nodes is None).".format( type(self).__name__ ) ) if not is_real_iterable(nodes): raise ValueError( "({}) The nodes parameter should be an iterable of node IDs.".format( type(self).__name__ ) ) if ( len(nodes) == 0 ): # this is not an error but maybe a warning should be printed to inform the caller print( "WARNING: ({}) No starting node IDs given. 
An empty list will be returned as a result.".format( type(self).__name__ ) ) if type(n) != int: raise ValueError( "({}) The number of walks per starting node, n, should be integer type.".format( type(self).__name__ ) ) if n <= 0: raise ValueError( "({}) The number of walks per starting node, n, should be a positive integer.".format( type(self).__name__ ) ) if type(length) != int: raise ValueError( "({}) The walk length parameter, length, should be integer type.".format( type(self).__name__ ) ) if length <= 0: raise ValueError( "({}) The walk length parameter, length, should be a positive integer.".format( type(self).__name__ ) ) if type(metapaths) != list: raise ValueError( "({}) The metapaths parameter must be a list of lists.".format( type(self).__name__ ) ) for metapath in metapaths: if type(metapath) != list: raise ValueError( "({}) Each metapath must be a list of node labels.".format( type(self).__name__ ) ) if len(metapath) < 2: raise ValueError( "({}) Each metapath must specify at least two node types.".format( type(self).__name__ ) ) for node_label in metapath: if type(node_label) != str: raise ValueError( "({}) Node labels in metapaths must be string type.".format( type(self).__name__ ) ) if metapath[0] != metapath[-1]: raise ValueError( "({}) The first and last node type in a metapath should be the same.".format( type(self).__name__ ) ) if type(node_type_attribute) != str: raise ValueError( "({}) The parameter node_type_attribute should be string type, not {} as given.".format( type(self).__name__, type(node_type_attribute).__name__ ) ) if seed is not None: if type(seed) != int: raise ValueError( "({}) The random number generator seed value, seed, should be integer type or None.".format( type(self).__name__ ) ) if seed < 0: raise ValueError( "({}) The random number generator seed value, seed, should be a positive integer or None.".format( type(self).__name__ ) ) class DepthFirstWalk(GraphWalk): """ Depth First Walk that generates all paths from a starting node to a given depth. It can be used to extract, in a memory-efficient way, a sub-graph starting from a node and up to a given depth. """ # TODO: Implement the run method pass class BreadthFirstWalk(GraphWalk): """ Breadth First Walk that generates all paths from a starting node to a given depth. It can be used to extract a sub-graph starting from a node and up to a given depth. """ # TODO: Implement the run method pass class SampledBreadthFirstWalk(GraphWalk): """ Breadth First Walk that generates a sampled number of paths from a starting node. It can be used to extract a random sub-graph starting from a set of initial nodes. """ def run(self, nodes=None, n=1, n_size=None, seed=None): """ Performs a sampled breadth-first walk starting from the root nodes. Args: nodes: A list of root node ids such that from each node n BFWs will be generated up to the given depth d. n: Number of walks per node id. n_size: The number of neighbouring nodes to expand at each depth of the walk. Sampling of neighbours with replacement is always used regardless of the node degree and number of neighbours requested. seed: Random number generator seed; default is None Returns: A list of lists such that each list element is a sequence of ids corresponding to a BFW.
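Example: a minimal sketch of a call. It assumes the walker was constructed earlier from a StellarGraph instance (construction not shown here; the node IDs are illustrative only):

    bfw = SampledBreadthFirstWalk(stellar_graph)
    # 2 walks per root node, sampling 2 neighbours at hop 1 and 3 at hop 2
    subgraph_walks = bfw.run(nodes=["n1", "n2"], n=2, n_size=[2, 3], seed=42)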
""" self._check_parameter_values(nodes=nodes, n=n, n_size=n_size, seed=seed) walks = [] d = len(n_size) # depth of search rs = random.getstate() if seed: # seed the random number generator random.seed(seed) else: # Restore the random state random.setstate(self._random_state) for node in nodes: # iterate over root nodes for _ in range(n): # do n bounded breadth first walks from each root node q = list() # the queue of neighbours walk = list() # the list of nodes in the subgraph of node # extend() needs iterable as parameter; we use list of tuples (node id, depth) q.extend([(node, 0)]) while len(q) > 0: # remove the top element in the queue # index 0 pop the item from the front of the list frontier = q.pop(0) depth = frontier[1] + 1 # the depth of the neighbouring nodes walk.extend([frontier[0]]) # add to the walk # consider the subgraph up to and including depth d from root node if depth <= d: neighbours = self.neighbors(self.graph, frontier[0]) if len(neighbours) == 0: break else: # sample with replacement neighbours = [ random.choice(neighbours) for _ in range(n_size[depth - 1]) ] # add them to the back of the queue q.extend([(sampled_node, depth) for sampled_node in neighbours]) # finished i-th walk from node so add it to the list of walks as a list walks.append(walk) # Store current random state and restore original random state self._random_state = random.getstate() random.setstate(rs) return walks def _check_parameter_values(self, nodes, n, n_size, seed): """ Checks that the parameter values are valid or raises ValueError exceptions with a message indicating the parameter (the first one encountered in the checks) with invalid value. Args: nodes: A list of root node ids such that from each node n BFWs will be generated up to the given depth d. n: Number of walks per node id. n_size: The number of neighbouring nodes to expand at each depth of the walk. seed: Random number generator seed; default is None """ if nodes is None: raise ValueError( "({}) A list of root node IDs was not provided (nodes parameter is None).".format( type(self).__name__ ) ) if not is_real_iterable(nodes): raise ValueError( "({}) The nodes parameter should be an iterable of node IDs.".format( type(self).__name__ ) ) if ( len(nodes) == 0 ): # this is not an error but maybe a warning should be printed to inform the caller print( "WARNING: ({}) No root node IDs given. 
An empty list will be returned as a result.".format( type(self).__name__ ) ) if type(n) != int: raise ValueError( "({}) The number of walks per root node, n, should be integer type.".format( type(self).__name__ ) ) if n <= 0: raise ValueError( "({}) The number of walks per root node, n, should be a positive integer.".format( type(self).__name__ ) ) if n_size is None: raise ValueError( "({}) The neighbourhood size, n_size, must be a list of integers, not None.".format( type(self).__name__ ) ) if type(n_size) != list: raise ValueError( "({}) The neighbourhood size, n_size, must be a list of integers.".format( type(self).__name__ ) ) if len(n_size) == 0: raise ValueError( "({}) The neighbourhood size, n_size, should not be an empty list.".format( type(self).__name__ ) ) for d in n_size: if type(d) != int: raise ValueError( "({}) The neighbourhood size, n_size, must be a list of non-negative integers.".format( type(self).__name__ ) ) if d < 0: raise ValueError( "({}) The neighbourhood size, n_size, must be a list of non-negative integers.".format( type(self).__name__ ) ) if seed is not None: if type(seed) != int: raise ValueError( "({}) The random number generator seed value, seed, should be integer type or None.".format( type(self).__name__ ) ) if seed < 0: raise ValueError( "({}) The random number generator seed value, seed, should be a positive integer or None.".format( type(self).__name__ ) ) class SampledHeterogeneousBreadthFirstWalk(GraphWalk): """ Breadth First Walk for heterogeneous graphs that generates a sampled number of paths from a starting node. It can be used to extract a random sub-graph starting from a set of initial nodes. """ def run(self, nodes=None, n=1, n_size=None, seed=None): """ Performs a sampled breadth-first walk starting from the root nodes. The graph schema used to constrain sampling by node and edge type is taken from the walker itself (self.graph_schema). Args: nodes: A list of root node ids such that from each node n BFWs will be generated with the number of samples per hop specified in n_size. n: Number of walks per node id. n_size: The number of neighbouring nodes to expand at each depth of the walk. Sampling of neighbours with replacement is always used regardless of the node degree and number of neighbours requested. seed: Random number generator seed; default is None Returns: A list of lists such that each list element is a sequence of ids corresponding to a sampled Heterogeneous BFW.
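Example: a sketch only; it assumes the walker was constructed from a heterogeneous StellarGraph whose schema is available as self.graph_schema (construction not shown here; the node ID is illustrative):

    hbfw = SampledHeterogeneousBreadthFirstWalk(stellar_graph)
    # one walk per root node, sampling 2 neighbours per edge type at each of 2 hops
    het_walks = hbfw.run(nodes=["u1"], n=1, n_size=[2, 2], seed=42)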
""" self._check_parameter_values( nodes=nodes, n=n, n_size=n_size, graph_schema=self.graph_schema, seed=seed ) walks = [] d = len(n_size) # depth of search rs = random.getstate() if seed: # seed the random number generator random.seed(seed) else: # Restore the random state random.setstate(self._random_state) for node in nodes: # iterate over root nodes for _ in range(n): # do n bounded breadth first walks from each root node q = list() # the queue of neighbours walk = list() # the list of nodes in the subgraph of node # Start the walk by adding the head node, and node type to the frontier list q node_type = self.graph_schema.get_node_type(node) q.extend([(node, node_type, 0)]) # add the root node to the walks walk.append([node]) while len(q) > 0: # remove the top element in the queue and pop the item from the front of the list frontier = q.pop(0) current_node, current_node_type, depth = frontier depth = depth + 1 # the depth of the neighbouring nodes # consider the subgraph up to and including depth d from root node if depth <= d: # Find edge types for current node type current_edge_types = self.graph_schema.schema[current_node_type] # Create samples of neigbhours for all edge types for et in current_edge_types: neigh_et = self.adj[et][current_node] # If there are no neighbours of this type then we return None # in the place of the nodes that would have been sampled # YT update: with the new way to get neigh_et from self.adj[et][current_node], len(neigh_et) is always > 0. # In case of no neighbours of the current node for et, neigh_et == [None], # and samples automatically becomes [None]*n_size[depth-1] if len(neigh_et) > 0: samples = [ random.choice(neigh_et) for _ in range(n_size[depth - 1]) ] # Choices limits us to Python 3.6+ # samples = random.choices(neigh_et, k=n_size[depth - 1]) else: # this doesn't happen anymore, see the comment above samples = [None] * n_size[depth - 1] walk.append(samples) q.extend( [ (sampled_node, et.n2, depth) for sampled_node in samples ] ) # finished i-th walk from node so add it to the list of walks as a list walks.append(walk) # Store current random state and restore original random state self._random_state = random.getstate() random.setstate(rs) return walks def _check_parameter_values(self, nodes, n, n_size, graph_schema, seed): """ Checks that the parameter values are valid or raises ValueError exceptions with a message indicating the parameter (the first one encountered in the checks) with invalid value. Args: nodes: A list of root node ids such that from each node n BFWs will be generated up to the given depth d. n: Number of walks per node id. n_size: The number of neighbouring nodes to expand at each depth of the walk. graph_schema: None or a stellargraph graph schema object seed: Random number generator seed; default is None """ if nodes is None: raise ValueError( "({}) A list of root node IDs was not provided (nodes parameter is None).".format( type(self).__name__ ) ) if not is_real_iterable(nodes): raise ValueError( "({}) The nodes parameter should be an iterable of node IDs.".format( type(self).__name__ ) ) if ( len(nodes) == 0 ): # this is not an error but maybe a warning should be printed to inform the caller print( "WARNING: ({}) No root node IDs given. 
An empty list will be returned as a result.".format( type(self).__name__ ) ) if type(n) != int: raise ValueError( "({}) The number of walks per root node, n, should be integer type.".format( type(self).__name__ ) ) if n <= 0: raise ValueError( "({}) The number of walks per root node, n, should be a positive integer.".format( type(self).__name__ ) ) if n_size is None: raise ValueError( "({}) The neighbourhood size, n_size, must be a list of integers, not None.".format( type(self).__name__ ) ) if type(n_size) != list: raise ValueError( "({}) The neighbourhood size, n_size, must be a list of integers.".format( type(self).__name__ ) ) if len(n_size) == 0: raise ValueError( "({}) The neighbourhood size, n_size, should not be an empty list.".format( type(self).__name__ ) ) for d in n_size: if type(d) != int: raise ValueError( "({}) The neighbourhood size, n_size, must be a list of integers.".format( type(self).__name__ ) ) if d < 0: raise ValueError( "({}) Each element of the neighbourhood size, n_size, should be a non-negative integer.".format( type(self).__name__ ) ) if graph_schema is not None and type(graph_schema) is not GraphSchema: raise ValueError( "({}) The parameter graph_schema should be either None or of type GraphSchema.".format( type(self).__name__ ) ) if seed is not None: if type(seed) != int: raise ValueError( "({}) The random number generator seed value, seed, should be integer type or None.".format( type(self).__name__ ) ) if seed < 0: raise ValueError( "({}) The random number generator seed value, seed, should be a positive integer or None.".format( type(self).__name__ ) ) PK!N""stellargraph/data/loader.py# -*- coding: utf-8 -*- # # Copyright 2018 Data61, CSIRO # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import pandas as pd import networkx as nx from stellargraph.data.epgm import EPGM from stellargraph.core.graph import * from stellargraph import globalvar def from_epgm(epgm_location, dataset_name=None, directed=False): """ Imports a graph stored in EPGM format to a NetworkX object Args: epgm_location (str): The directory containing the EPGM data dataset_name (str), optional: The name of the dataset to import directed (bool): If True, load as a directed graph, otherwise load as an undirected graph Returns: A NetworkX graph containing the data for the EPGM-stored graph.
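Example: a sketch of loading a dataset; the directory path and dataset name below are illustrative only:

    Gnx = from_epgm("~/data/cora.epgm", dataset_name="cora", directed=False)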
""" G_epgm = EPGM(epgm_location) graphs = G_epgm.G["graphs"] # if dataset_name is not given, use the name of the 1st graph head if not dataset_name: dataset_name = graphs[0]["meta"]["label"] print( "WARNING: dataset name not specified, using dataset '{}' in the 1st graph head".format( dataset_name ) ) # Select graph using dataset_name for g in graphs: if g["meta"]["label"] == dataset_name: graph_id = g["id"] # Convert to StellarGraph (via nx) Gnx = G_epgm.to_nx(graph_id, directed=directed) print( "Graph statistics: {} nodes, {} edges".format( Gnx.number_of_nodes(), Gnx.number_of_edges() ) ) return Gnx def load_dataset_BlogCatalog3(location): """ This method loads the BlogCatalog3 network dataset (http://socialcomputing.asu.edu/datasets/BlogCatalog3) into a networkx undirected heterogeneous graph. The graph has two types of nodes, 'user' and 'group', and two types of edges, 'friend' and 'belongs'. The 'friend' edges connect two 'user' nodes and the 'belongs' edges connects 'user' and 'group' nodes. The node and edge types are not included in the dataset that is a collection of node and group ids along with the list of edges in the graph. Important note about the node IDs: The dataset uses integers for node ids. However, the integers from 1 to 39 are used as IDs for both users and groups. This would cause a confusion when constructing the networkx graph object. As a result, we convert all IDs to string and append the character 'u' to the integer ID for user nodes and the character 'g' to the integer ID for group nodes. Args: location: The directory where the dataset is located Returns: A networkx Graph object. """ location = os.path.expanduser(location) if not os.path.isdir(location): print("The location {} is not a directory.".format(location)) exit(0) # load the raw data user_node_ids = pd.read_csv(os.path.join(location, "nodes.csv"), header=None) group_ids = pd.read_csv(os.path.join(location, "groups.csv"), header=None) edges = pd.read_csv(os.path.join(location, "edges.csv"), header=None) group_edges = pd.read_csv(os.path.join(location, "group-edges.csv"), header=None) # convert the dataframes to lists because that is what networkx expects as input user_node_ids = user_node_ids[0].tolist() group_ids = group_ids[0].tolist() edges = list(edges.itertuples(index=False, name=None)) # convert to list of tuples group_edges = list(group_edges.itertuples(index=False, name=None)) # The dataset uses integers for node ids. However, the integers from 1 to 39 are used as IDs for both users and # groups. This would cause a confusion when constructing the networkx graph object. As a result, we convert all # IDs to string and append the character 'p' to the integer ID for user nodes and the character 'g' to the integer # ID for group nodes. user_node_ids = ["u" + str(user_node_id) for user_node_id in user_node_ids] group_ids = ["g" + str(group_id) for group_id in group_ids] edges = [("u" + str(from_node), "u" + str(to_node)) for from_node, to_node in edges] group_edges = [ ("u" + str(from_node), "g" + str(to_node)) for from_node, to_node in group_edges ] g_nx = nx.Graph() # create the graph # add user and group nodes with labels 'Person' and 'Group' respectively. 
g_nx.add_nodes_from(user_node_ids, label="user") g_nx.add_nodes_from(group_ids, label="group") # add the user-user edges with label 'friend' g_nx.add_edges_from(edges, label="friend") # add user-group edges with label 'belongs' g_nx.add_edges_from(group_edges, label="belongs") return g_nx PK!MMM"stellargraph/data/node_splitter.py# -*- coding: utf-8 -*- # # Copyright 2017-2018 Data61, CSIRO # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. __all__ = ["train_val_test_split", "NodeSplitter"] import numpy as np import pandas as pd from stellargraph.core.graph import StellarGraphBase from stellargraph import globalvar # Easier functional interface for the splitter: def train_val_test_split( G, node_type=None, test_size=0.4, train_size=0.2, targets=None, split_equally=False, seed=None, ): """ Splits node data into train, test, validation, and unlabeled sets. Any nodes that have a target value equal to globalvar.UNKNOWN_TARGET_ATTRIBUTE are added to the unlabeled set. The validation set includes all nodes that remain after the train, test and unlabeled sets have been created. As a result, it is possible that the validation set is empty. Args: G: StellarGraph containing the nodes to be split. node_type: str or None, optional (default=None) If specified, only nodes of this type are included in the split; in that case G must be a StellarGraph. test_size: float, int If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the absolute number of test samples. train_size: float, int If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split. If int, represents the absolute number of train samples. If None, the value is automatically set to the complement of the test size. seed: int or None, optional (default=None) If this is an int the seed will be used to initialize a random number generator, otherwise the numpy default will be used. targets: None or DataFrame or dict, optional (default=None) If None, the nodes are randomly assigned to each partition. If this is a Pandas DataFrame with node_ids as the index, or a dictionary with node_ids as keys and target as values, then these values will be used to find unlabelled nodes and if `split_equally` is True these target values will be used to sample equal numbers of each class. split_equally: bool (default=False) If `split_equally` is True the values passed into the targets argument will be used to sample equal numbers for the train split by class label. Returns: y_train, y_val, y_test, y_unlabeled """ node_splitter = NodeSplitter() # Get list of nodes to split if node_type is None: nodes = list(G) elif isinstance(G, StellarGraphBase): nodes = G.nodes_of_type(node_type) else: raise TypeError("G must be a StellarGraph if node_type is not None") # Number of nodes and number without a label n_nodes = len(nodes) # Extract the target information if targets is not None: # TODO: The equal sampling option will fail if these values are not hashable.
# Check that the targets argument is the correct type if isinstance(targets, pd.DataFrame): target_values = [ targets.loc[n] if n in targets.index else globalvar.UNKNOWN_TARGET_ATTRIBUTE for n in nodes ] elif isinstance(targets, dict): target_values = [ targets.get(n, globalvar.UNKNOWN_TARGET_ATTRIBUTE) for n in nodes ] else: raise TypeError( "The targets are expected to be either a Pandas DataFrame or a dict." ) n_known = sum(t != globalvar.UNKNOWN_TARGET_ATTRIBUTE for t in target_values) else: n_known = n_nodes target_values = [0] * n_nodes if n_known == 0: raise RuntimeError("No nodes with target attribute to split.") # Find the number of nodes to use in the training set if isinstance(train_size, float) and (0 < train_size <= 1): train_size_n = int(n_known * train_size) elif isinstance(train_size, int): train_size_n = train_size else: raise ValueError("Splitter: train_size should be specified as a float or int") # Find the number of nodes to use in the test set if isinstance(test_size, float) and (0 < test_size <= 1): test_size_n = int(n_known * test_size) elif isinstance(test_size, int): test_size_n = test_size else: raise ValueError("Splitter: test_size should be specified as a float or int") # The validation set gets all the labelled nodes left over after the train and test sets are filled val_size_n = max(0, n_known - (train_size_n + test_size_n)) # Check that these sizes make sense if (train_size_n + test_size_n + val_size_n) > n_known: raise ValueError( "Number of train, test and val nodes " "is greater than the total number of labelled nodes." ) # Now the splitter needs the node IDs and labels zipped together # TODO: This is a hack as the splitter only works when this array is string type nodeid_and_label = np.array([nl for nl in enumerate(target_values)], dtype="U") # If stratified sampling, we need the target labels. if split_equally and targets is not None: class_set = set(target_values) # Remove the unknown target type class_set.discard(globalvar.UNKNOWN_TARGET_ATTRIBUTE) # The number of classes we have n_classes = len(class_set) if n_classes == 0: raise RuntimeError( "Found no usable target classes for split_equally." ) if train_size_n < n_classes: raise RuntimeError( "The training set size must be at least the number of classes." ) # The number of nodes we want per class p = int(train_size_n / n_classes) splits = node_splitter.train_test_split( y=nodeid_and_label, p=p, method="count", test_size=test_size_n, seed=seed ) else: splits = node_splitter.train_test_split( y=nodeid_and_label, method="absolute", train_size=train_size_n, test_size=test_size_n, seed=seed, ) # Get the node_ids out of the splitter node_ids_out = [[nodes[int(ind)] for ind in split[:, 0]] for split in splits] return node_ids_out class NodeSplitter(object): def __init__(self): self.format_epgm = False self.g_epgm = None self.g_id = None self._random = None def _get_nodes(self, graph_nodes, node_type, target_attribute): """ Returns a list of node IDs for the subset of graph_nodes that have the given node type. Args: graph_nodes: List of OrderedDict with vertex data for graph in EPGM format node_type: The node type of interest target_attribute: The target attribute key Returns: List of node IDs that have given node type """ # This code will fail if a node of node_type is missing the target_attribute.
# We can fix this by using node['data'].get(target_attribute, None) so that at least all nodes of the # given type are returned. However, we must check for None in target_attribute later to exclude these nodes # from being added to train, test, and validation datasets. y = [ ( node["id"], node["data"].get(target_attribute, globalvar.UNKNOWN_TARGET_ATTRIBUTE), ) for node in graph_nodes if node["meta"][globalvar.TYPE_ATTR_NAME] == node_type ] return y def _check_parameters(self, y, p, method, test_size, train_size, seed): """ Checks that the parameters have valid values. If not, it raises a ValueError exception with a message corresponding to the invalid parameter. Args: y: Array of size Nx2 containing node id, label columns. p: Percent or count of the number of points for each class to sample. method: One of 'count', 'percent', or 'absolute'. test_size: number of points in the test set. For method 'count', it should be less than or equal to N - (np.unique(labels) * nc) where N is the number of labeled points in y. train_size: The number of points in the train set, only used by method 'absolute'. seed: seed for random number generator, positive int or 0 """ if not isinstance(y, np.ndarray): raise ValueError("({}) y should be numpy array".format(type(self).__name__)) if method != "count" and method != "percent" and method != "absolute": raise ValueError( "({}) Valid methods are 'count', 'percent', and 'absolute' not {}".format( type(self).__name__, method ) ) if seed is not None: if not isinstance(seed, int): raise ValueError( "({}) The random number generator seed value, seed, should be integer type or None.".format( type(self).__name__ ) ) if seed < 0: raise ValueError( "({}) The random number generator seed value, seed, should be a positive integer or None.".format( type(self).__name__ ) ) if method == "count": if not isinstance(p, int) or p <= 0: raise ValueError( "({}) p should be positive integer".format(type(self).__name__) ) if test_size is None or not isinstance(test_size, int) or test_size <= 0: raise ValueError( "({}) test_size must be positive integer".format( type(self).__name__ ) ) elif method == "percent": if not isinstance(p, float) or p < 0. or p > 1.: raise ValueError( "({}) p should be float in the range [0,1].".format( type(self).__name__ ) ) elif method == "absolute": if test_size is None or not isinstance(test_size, int) or test_size <= 0: raise ValueError( "({}) test_size should be positive integer".format( type(self).__name__ ) ) if train_size is None or not isinstance(train_size, int) or train_size <= 0: raise ValueError( "({}) train_size should be positive integer".format( type(self).__name__ ) ) def train_test_split( self, y=None, p=10, method="count", test_size=None, train_size=None, seed=None ): """ Splits node data into train, test, validation, and unlabeled sets. Any points in y that have value globalvar.UNKNOWN_TARGET_ATTRIBUTE are added to the unlabeled set. The validation set includes all the points that remain after the train, test and unlabeled sets have been created. As a result, it is possible that the validation set is empty, e.g., when method is set to 'percent'. The train and test sets are built based on the specified method, 'count', 'percent', or 'absolute'. method='count': The value of parameter p specifies the number of points in the train set for each class. The test set size must be specified using the test_size parameter.
method='percent': The value of parameter p specifies the train set size (and 1-p the test set size) as a percentage of the total number of points in y (including the unlabeled points). The split is performed uniformly at random and the point labels (as specified in y) are not taken into account. method='absolute': The values of the parameters train_size and test_size specify the size of the train and test sets respectively. Points are selected uniformly at random and the labels (as specified in y) are not taken into account. Args: y: Array of size Nx2 containing node id, label columns. p: Percent or count of the number of points for each class to sample. method: One of 'count', 'percent', or 'absolute'. test_size: number of points in the test set. For method 'count', it should be less than or equal to N - (np.unique(labels) * nc) where N is the number of labeled points in y. train_size: The number of points in the train set, only used by method 'absolute'. seed: seed for random number generator, positive int or 0 Returns: y_train, y_val, y_test, y_unlabeled """ self._check_parameters( y=y, p=p, method=method, test_size=test_size, train_size=train_size, seed=seed, ) if self._random is None: self._random = np.random.RandomState(seed=seed) if method == "count": return self._split_data(y, p, test_size) elif method == "percent": n_unlabelled_points = np.sum(y[:, 1] == globalvar.UNKNOWN_TARGET_ATTRIBUTE) train_size = int((y.shape[0] - n_unlabelled_points) * p) test_size = y.shape[0] - n_unlabelled_points - train_size return self._split_data_absolute( y=y, test_size=test_size, train_size=train_size ) elif method == "absolute": return self._split_data_absolute( y=y, test_size=test_size, train_size=train_size ) def _split_data_absolute(self, y, test_size, train_size): """ Splits given data such that the sizes of the test and train sets are fixed to the values given. Args: y: Array of size N x 2 containing node id + labels. test_size: number of points in test set. train_size: The number of points in the train set. Returns: y_train, y_val, y_test, y_unlabeled """ # The label column in y could include None type, that is, points with no ground truth label. These, if any, will # be returned separately in the y_unlabeled dataset y_used = np.zeros(y.shape[0]) # initially, all points are available # indexes of points with no class label: ind = np.nonzero(y[:, 1] == globalvar.UNKNOWN_TARGET_ATTRIBUTE) y_unlabeled = y[ind] y_used[ind] = 1 ind = np.nonzero(y_used == 0) # unused points ind_sampled = self._random.choice(ind[0], train_size, replace=False) y_train = y[ind_sampled] # mark these as used to make sure that they are not sampled for the test set y_used[ind_sampled] = 1 # now sample test_size points for the test set ind = np.nonzero(y_used == 0) # indexes of points that are not in training set if len(ind[0]) < test_size: raise Exception( "Not enough nodes available for the test set: available {} nodes, needed {}. Aborting".format( len(ind[0]), test_size ) ) ind_sampled = self._random.choice(ind[0], test_size, replace=False) y_test = y[ind_sampled] y_used[ind_sampled] = 1 # print("y_test shape: ", y_test.shape) # Validation set # the remaining labeled points (if any) go into the validation set ind = np.nonzero(y_used == 0) y_val = y[ind[0]] # print("y_val shape:", y_val.shape) return y_train, y_val, y_test, y_unlabeled def _split_data(self, y, nc, test_size): """ Splits the data according to the scheme in Yang et al, ICML 2016, Revisiting semi-supervised learning with graph embeddings.
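Concretely, nc points per class are sampled uniformly at random for the train set, then test_size points are sampled from the remaining labelled points for the test set, and whatever labelled points are left over form the validation set. For example (illustrative numbers only), with 3 classes, nc=20 and test_size=1000, the train set holds 60 points; a class with fewer than nc labelled points contributes half of its points to the train set instead.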
Args: y: Array of size N x 2 containing node id + labels. nc: number of points from each class in train set. test_size: number of points in test set; it should be less than or equal to N - (np.unique(labels) * nc). Returns: y_train, y_val, y_test, y_unlabeled """ y_used = np.zeros(y.shape[0]) # initially, all points are available # indexes of points with no class label: ind = np.nonzero(y[:, 1] == globalvar.UNKNOWN_TARGET_ATTRIBUTE) y_unlabeled = y[ind] y_used[ind] = 1 y_train = None class_labels = np.unique(y[:, 1]) ind = class_labels == globalvar.UNKNOWN_TARGET_ATTRIBUTE class_labels = class_labels[np.logical_not(ind)] if test_size > y.shape[0] - class_labels.size * nc: # re-adjust so that none of the training samples end up in the test set test_size = y.shape[0] - class_labels.size * nc for clabel in class_labels: # indexes of points with class label clabel: ind = np.nonzero(y[:, 1] == clabel) # select nc of these at random for the training set if ind[0].size <= nc: # too few labeled examples for class so use half for training and half for testing ind_selected = self._random.choice( ind[0], ind[0].size // 2, replace=False ) else: ind_selected = self._random.choice(ind[0], nc, replace=False) # mark these as used to make sure that they are not sampled for the test set: y_used[ind_selected] = 1 if y_train is None: y_train = y[ind_selected] else: # print("y_train shape:", y_train.shape) # print("y[ind_selected] shape:", y[ind_selected].shape) y_train = np.vstack((y_train, y[ind_selected])) # now sample test_size points for the test set ind = np.nonzero(y_used == 0) # indexes of points that are not in training set if len(ind[0]) < test_size: raise Exception( "Not enough nodes available for the test set: available {} nodes, needed {}. Aborting".format( len(ind[0]), test_size ) ) ind_selected = self._random.choice(ind[0], test_size, replace=False) y_test = y[ind_selected] y_used[ind_selected] = 1 # print("y_test shape: ", y_test.shape) # the remaining points (if any) go into the validation set ind = np.nonzero(y_used == 0) y_val = y[ind[0]] # print("y_val shape:", y_val.shape) return y_train, y_val, y_test, y_unlabeled PK!stellargraph/globalvar.py# This file contains global attributes used throughout stellargraph FEATURE_ATTR_NAME = "feature" TARGET_ATTR_NAME = "target" TYPE_ATTR_NAME = "label" UNKNOWN_TARGET_ATTRIBUTE = "-1" PK!:>OAhhstellargraph/layer/__init__.py# -*- coding: utf-8 -*- # # Copyright 2018 Data61, CSIRO # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ The layer package contains implementations of popular neural network layers for graph ML as Keras layers """ # __all__ = ["graphsage", "hinsage", "link_inference"] # Expose the layers from .graphsage import * from .hinsage import * from .link_inference import * PK!%**stellargraph/layer/graphsage.py# -*- coding: utf-8 -*- # # Copyright 2018 Data61, CSIRO # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ GraphSAGE and compatible aggregator layers """ __all__ = ["GraphSAGE", "MeanAggregator"] import numpy as np from keras.engine.topology import Layer from keras import Input from keras import backend as K from keras.layers import Lambda, Dropout, Reshape from keras.utils import Sequence from keras import activations from typing import List, Tuple, Callable, Union, AnyStr class MeanAggregator(Layer): """ Mean Aggregator for GraphSAGE implemented with Keras base layer Args: output_dim (int): Output dimension bias (bool): Optional bias act (Callable or str): name of the activation function to use (must be a Keras activation function), or alternatively, a TensorFlow operation. """ def __init__( self, output_dim: int = 0, bias: bool = False, act: Union[Callable, AnyStr] = "relu", **kwargs ): self.output_dim = output_dim assert output_dim % 2 == 0 self.half_output_dim = int(output_dim / 2) self.has_bias = bias self.act = activations.get(act) self.w_neigh = None self.w_self = None self.bias = None self._initializer = "glorot_uniform" super().__init__(**kwargs) def get_config(self): """ Gets class configuration for Keras serialization """ config = { "output_dim": self.output_dim, "bias": self.has_bias, "act": activations.serialize(self.act), } base_config = super().get_config() return {**base_config, **config} def build(self, input_shape): """ Builds layer Args: input_shape (list of list of int): Shape of input tensors for self and neighbour """ self.w_neigh = self.add_weight( name="w_neigh", shape=(input_shape[1][3], self.half_output_dim), initializer=self._initializer, trainable=True, ) self.w_self = self.add_weight( name="w_self", shape=(input_shape[0][2], self.half_output_dim), initializer=self._initializer, trainable=True, ) if self.has_bias: self.bias = self.add_weight( name="bias", shape=[self.output_dim], initializer="zeros", trainable=True, ) super().build(input_shape) def call(self, x, **kwargs): """ Apply MeanAggregation on input tensors, x Args: x: Keras Tensor Returns: Keras Tensor representing the aggregated embeddings in the input. """ neigh_means = K.mean(x[1], axis=2) from_self = K.dot(x[0], self.w_self) from_neigh = K.dot(neigh_means, self.w_neigh) total = K.concatenate([from_self, from_neigh], axis=2) return self.act(total + self.bias if self.has_bias else total) def compute_output_shape(self, input_shape): """ Computes the output shape of the layer. Assumes that the layer will be built to match that input shape provided. Args: input_shape (tuple of ints) Shape tuples can include None for free dimensions, instead of an integer. Returns: An input shape tuple. """ return input_shape[0][0], input_shape[0][1], self.output_dim class GraphSAGE: """ Implementation of the GraphSAGE algorithm with Keras layers. Args: layer_sizes (list): Hidden feature dimensions for each layer generator (Sequence): A NodeSequence or LinkSequence. If specified the n_samples and input_dim will be taken from this object. n_samples (list): (Optional: needs to be specified if no generator is provided.) The number of samples per layer in the model.
input_dim (int): The dimensions of the node features used as input to the model. aggregator (class): The GraphSAGE aggregator to use. Defaults to the `MeanAggregator`. bias (bool): If True a bias vector is learnt for each layer in the GraphSAGE model dropout (float): The dropout supplied to each layer in the GraphSAGE model. normalize (str): The normalization used after each layer, defaults to L2 normalization. """ def __init__( self, layer_sizes, generator=None, n_samples=None, input_dim=None, aggregator=None, bias=True, dropout=0., normalize="l2", ): # Set the aggregator layer used in the model if aggregator is None: self._aggregator = MeanAggregator elif issubclass(aggregator, Layer): self._aggregator = aggregator else: raise TypeError("Aggregator should be a subclass of Keras Layer") # Set the normalization layer used in the model if normalize == "l2": self._normalization = Lambda(lambda x: K.l2_normalize(x, axis=2)) elif normalize is None or normalize == "none": self._normalization = Lambda(lambda x: x) # Get the input_dim and num_samples from the generator if it is given # Use both the schema and head node type from the generator # TODO: Refactor the horror of generator.generator.graph... if generator is not None: self.n_samples = generator.generator.num_samples feature_sizes = generator.generator.graph.node_feature_sizes() if len(feature_sizes) > 1: raise RuntimeError( "GraphSAGE called on graph with more than one node type." ) self.input_feature_size = feature_sizes.popitem()[1] elif n_samples is not None and input_dim is not None: self.n_samples = n_samples self.input_feature_size = input_dim else: raise RuntimeError( "If generator is not provided, n_samples and input_dim must be specified." ) # Model parameters self.n_layers = len(self.n_samples) self.bias = bias self.dropout = dropout # Feature dimensions for each layer self.dims = [self.input_feature_size] + layer_sizes # Aggregator functions for each layer self._aggs = [ self._aggregator( output_dim=self.dims[layer + 1], bias=self.bias, act="relu" if layer < self.n_layers - 1 else "linear", ) for layer in range(self.n_layers) ] # Sizes of the neighbours for each layer self._neigh_reshape = [ [ Reshape((-1, max(1, self.n_samples[i]), self.dims[layer])) for i in range(self.n_layers - layer) ] for layer in range(self.n_layers) ] def __call__(self, x: List): """ Apply aggregator layers Args: x (list of Tensor): Batch input features Returns: Output tensor """ def compose_layers(_x: List, layer: int): """ Function to recursively compose aggregation layers. When current layer is at final layer, then length of _x should be 1, and compose_layers(_x, layer) returns _x[0].
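For example (illustrative numbers), with two layers and n_samples=[10, 5], the input _x holds three tensors: the head nodes, their 1-hop samples and their 2-hop samples. The first aggregation reduces this to two tensors, the second to one, and that single head-node tensor is returned.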
Args: _x: List of feature matrix tensors layer: Current layer index Returns: _x computed from current layer to output layer """ def x_next(agg): """ Compute the list of tensors for the next layer Args: agg (Layer): Aggregator layer to apply Returns: Output list of tensors of applying the aggregator to inputs """ return [ agg( [ Dropout(self.dropout)(_x[i]), Dropout(self.dropout)( self._neigh_reshape[layer][i](_x[i + 1]) ), ] ) for i in range(self.n_layers - layer) ] return ( compose_layers(x_next(self._aggs[layer]), layer + 1) if layer < self.n_layers else _x[0] ) assert isinstance(x, list), "Input features must be a list" assert ( len(x) == self.n_layers + 1 > 1 ), "Length of input features should match the number of GraphSAGE layers" return self._normalization(compose_layers(x, 0)) def _input_shapes(self) -> List[Tuple[int, int]]: """ Returns the input shapes for the tensors at each layer Returns: A list of tuples giving the shape (number of nodes, feature size) for the corresponding layer """ def shape_at(i: int) -> Tuple[int, int]: return ( max(1, np.product(self.n_samples[:i], dtype=int)), self.input_feature_size, ) input_shapes = [shape_at(i) for i in range(self.n_layers + 1)] return input_shapes def default_model(self, flatten_output=False): """ Return model with default inputs Args: flatten_output: The GraphSAGE model will return an output tensor of form (batch_size, 1, feature_size). If this flag is true, the output will be of size (batch_size, 1*feature_size) Returns: tuple: (x_inp, x_out) where ``x_inp`` is a list of Keras input tensors for the specified GraphSAGE model and ``x_out`` is the Keras tensor for the GraphSAGE model output. """ # Create tensor inputs x_inp = [Input(shape=s) for s in self._input_shapes()] # Output from GraphSAGE model x_out = self(x_inp) if flatten_output: x_out = Reshape((-1,))(x_out) return x_inp, x_out PK!6@~};};stellargraph/layer/hinsage.py# -*- coding: utf-8 -*- # # Copyright 2018 Data61, CSIRO # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Heterogeneous GraphSAGE and compatible aggregator layers """ __all__ = ["HinSAGE", "MeanHinAggregator"] from keras.engine.topology import Layer from keras import backend as K, Input from keras.layers import Lambda, Dropout, Reshape from keras.utils import Sequence from keras import activations from typing import List, Callable, Tuple, Dict, Union, AnyStr import itertools as it import operator as op class MeanHinAggregator(Layer): """Mean Aggregator for HinSAGE implemented with Keras base layer Args: output_dim (int): Output dimension bias (bool): Use bias in layer or not (Default False) act (Callable or str): name of the activation function to use (must be a Keras activation function), or alternatively, a TensorFlow operation.
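Example: a minimal sketch of constructing the aggregator directly; it is normally created internally by HinSAGE, and the output dimension below is illustrative (it must be even):

    agg = MeanHinAggregator(output_dim=32, bias=True, act="relu")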
""" def __init__( self, output_dim: int = 0, bias: bool = False, act: Union[Callable, AnyStr] = "relu", **kwargs ): self.output_dim = output_dim assert output_dim % 2 == 0 self.half_output_dim = int(output_dim / 2) self.has_bias = bias self.act = activations.get(act) self.nr = None self.w_neigh = [] self.w_self = None self.bias = None self._initializer = "glorot_uniform" super().__init__(**kwargs) def get_config(self): """ Gets class configuration for Keras serialization """ config = { "output_dim": self.output_dim, "bias": self.has_bias, "act": activations.serialize(self.act), } base_config = super().get_config() return {**base_config, **config} def build(self, input_shape): """ Builds layer Args: input_shape (list of list of int): Shape of input per neighbour type. """ # Weight matrix for each type of neighbour self.nr = len(input_shape) - 1 self.w_neigh = [ self.add_weight( name="w_neigh_" + str(r), shape=(input_shape[1 + r][3], self.half_output_dim), initializer=self._initializer, trainable=True, ) for r in range(self.nr) ] # Weight matrix for self self.w_self = self.add_weight( name="w_self", shape=(input_shape[0][2], self.half_output_dim), initializer=self._initializer, trainable=True, ) # Optional bias if self.has_bias: self.bias = self.add_weight( name="bias", shape=[self.output_dim], initializer="zeros", trainable=True, ) super().build(input_shape) def call(self, x, **kwargs): """ Apply MeanAggregation on input tensors, x Args: x: Keras Tensor Returns: Keras Tensor representing the aggregated embeddings in the input. """ neigh_means = [K.mean(z, axis=2) for z in x[1:]] from_self = K.dot(x[0], self.w_self) from_neigh = ( sum([K.dot(neigh_means[r], self.w_neigh[r]) for r in range(self.nr)]) / self.nr ) total = K.concatenate( [from_self, from_neigh], axis=2 ) # YT: this corresponds to concat=Partial # TODO: implement concat=Full and concat=False return self.act(total + self.bias if self.has_bias else total) def compute_output_shape(self, input_shape): """ Computes the output shape of the layer. Assumes that the layer will be built to match that input shape provided. Args: input_shape (tuple of ints) Shape tuples can include None for free dimensions, instead of an integer. Returns: An input shape tuple. """ return input_shape[0][0], input_shape[0][1], self.output_dim class HinSAGE: """ Implementation of the GraphSAGE algorithm extended for heterogeneous graphs with Keras layers. """ def __init__( self, layer_sizes, generator=None, n_samples=None, input_neighbor_tree=None, input_dim=None, aggregator=None, bias=True, dropout=0., normalize="l2", ): """ Args: layer_sizes (list of int): Hidden feature dimensions for each layer mapper (Sequence): A HinSAGENodeMapper or HinSAGELinkMapper. If specified the n_samples, input_neighbour_tree and input_dim will be taken from this object. n_samples: (Optional: needs to be specified if no mapper is provided.) The number of samples per layer in the model. input_neighbor_tree: A list of (node_type, [children]) tuples that specify the subtree to be created by the HinSAGE model. input_dim: The input dimensions for each node type as a dictionary of the form {node_type: feature_size}. aggregator: The HinSAGE aggregator to use. Defaults to the `MeanHinAggregator`. bias: If True a bias vector is learnt for each layer in the HinSAGE model dropout: The dropout supplied to each layer in the HinSAGE model. normalize: The normalization used after each layer, defaults to L2 normalization. 
""" def eval_neigh_tree_per_layer(input_tree): """ Function to evaluate the neighbourhood tree structure for every layer. The tree structure at each layer is a truncated version of the previous layer. Args: input_tree: Neighbourhood tree for the input batch Returns: List of neighbourhood trees """ reduced = [ li for li in input_tree if all(li_neigh < len(input_tree) for li_neigh in li[1]) ] return ( [input_tree] if len(reduced) == 0 else [input_tree] + eval_neigh_tree_per_layer(reduced) ) # Set the aggregator layer used in the model if aggregator is None: self._aggregator = MeanHinAggregator elif issubclass(aggregator, Layer): self._aggregator = aggregator else: raise TypeError("Aggregator should be a subclass of Keras Layer") # Set the normalization layer used in the model if normalize == "l2": self._normalization = Lambda(lambda x: K.l2_normalize(x, axis=2)) elif normalize is None or normalize == "none": self._normalization = Lambda(lambda x: x) # Get the sampling tree, input_dim, and num_samples from the mapper if it is given # Use both the schema and head node type from the mapper # TODO: Refactor the horror of generator.generator.graph... if generator is not None: self.n_samples = generator.generator.num_samples self.subtree_schema = generator.generator.schema.type_adjacency_list( generator.head_node_types, len(self.n_samples) ) self.input_dims = generator.generator.graph.node_feature_sizes() elif ( input_neighbor_tree is not None and n_samples is not None and input_dim is not None ): self.subtree_schema = input_neighbor_tree self.n_samples = n_samples self.input_dims = input_dim else: raise RuntimeError( "If mapper is not provided, input_neighbour_tree, n_samples," " and input_dim must be specified." ) # Set parameters for the model self.n_layers = len(self.n_samples) self.bias = bias self.dropout = dropout # Neighbourhood info per layer self.neigh_trees = eval_neigh_tree_per_layer( [li for li in self.subtree_schema if len(li[1]) > 0] ) # Depth of each input i.e. number of hops from root nodes depth = [ self.n_layers + 1 - sum([1 for li in [self.subtree_schema] + self.neigh_trees if i < len(li)]) for i in range(len(self.subtree_schema)) ] # Dict of {node type: dimension} per layer self.dims = [ dim if isinstance(dim, dict) else {k: dim for k, _ in ([self.subtree_schema] + self.neigh_trees)[layer]} for layer, dim in enumerate([self.input_dims] + layer_sizes) ] # Dict of {node type: aggregator} per layer self._aggs = [ { node_type: self._aggregator( output_dim, bias=self.bias, act="relu" if layer < self.n_layers - 1 else "linear", ) for node_type, output_dim in self.dims[layer + 1].items() } for layer in range(self.n_layers) ] # Reshape object per neighbour per node per layer self._neigh_reshape = [ [ [ Reshape( ( -1, self.n_samples[depth[i]], self.dims[layer][self.subtree_schema[neigh_index][0]], ) ) for neigh_index in neigh_indices ] for i, (_, neigh_indices) in enumerate(self.neigh_trees[layer]) ] for layer in range(self.n_layers) ] def __call__(self, x: List): """ Apply aggregator layers Args: x (list of Tensor): Batch input features Returns: Output tensor """ def compose_layers(x: List, layer: int): """ Function to recursively compose aggregation layers. When current layer is at final layer, then compose_layers(x, layer) returns x. 
Args: x (list of Tensor): List of feature matrix tensors layer (int): Current layer index Returns: x computed from current layer to output layer """ def neigh_list(i, neigh_indices): """ Get the correctly-shaped list of neighbour tensors for the tensor at index i Args: i (int): Tensor index neigh_indices (list of int): list of indices of the neighbour tensors Returns: List of neighbour tensors """ return [ self._neigh_reshape[layer][i][ni](x[neigh_index]) for ni, neigh_index in enumerate(neigh_indices) ] def x_next(agg: Dict[str, Layer]): """ Compute the list of tensors for the next layer Args: agg (Dict[str, Layer]): Dict of node type to aggregator layer Returns: Outputs of applying the aggregators as a list of Tensors """ return [ agg[node_type]( [ Dropout(self.dropout)(x[i]), *[ Dropout(self.dropout)(ne) for ne in neigh_list(i, neigh_indices) ], ], name="{}_{}".format(node_type, layer), ) for i, (node_type, neigh_indices) in enumerate( self.neigh_trees[layer] ) ] return ( compose_layers(x_next(self._aggs[layer]), layer + 1) if layer < self.n_layers else x ) x = compose_layers(x, 0) return ( self._normalization(x[0]) if len(x) == 1 else [self._normalization(xi) for xi in x] ) def _input_shapes(self) -> List[Tuple[int, int]]: """ Returns the input shapes for the tensors of the supplied neighbourhood type tree Returns: A list of tuples giving the shape (number of nodes, feature size) for the corresponding item in the neighbourhood type tree (self.subtree_schema) """ neighbor_sizes = list(it.accumulate([1] + self.n_samples, op.mul)) def get_shape(stree, cnode, level=0): adj = stree[cnode][1] size_dict = { cnode: (neighbor_sizes[level], self.input_dims[stree[cnode][0]]) } if len(adj) > 0: size_dict.update( { k: s for a in adj for k, s in get_shape(stree, a, level + 1).items() } ) return size_dict input_shapes = dict() for ii in range(len(self.subtree_schema)): input_shapes_ii = get_shape(self.subtree_schema, ii) # Update input_shapes if input_shapes_ii.keys() are not already in input_shapes.keys(): if ( len(set(input_shapes_ii.keys()).intersection(set(input_shapes.keys()))) == 0 ): input_shapes.update(input_shapes_ii) return [input_shapes[ii] for ii in range(len(self.subtree_schema))] def default_model(self, flatten_output=False): """ Return model with default inputs Args: flatten_output (bool): The HinSAGE model returns an output tensor of form (batch_size, 1, feature_size) - if this flag is True, the output will be resized to (batch_size, feature_size) Returns: tuple: (x_inp, x_out) where ``x_inp`` is a list of Keras input tensors for the specified HinSAGE model and ``x_out`` is the Keras tensor for the HinSAGE model output. """ # Create tensor inputs x_inp = [Input(shape=s) for s in self._input_shapes()] # Output from the HinSAGE model x_out = self(x_inp) if flatten_output: x_out = Reshape((-1,))(x_out) return x_inp, x_out PK!Ԉ~'$'$$stellargraph/layer/link_inference.py# -*- coding: utf-8 -*- # # Copyright 2018 Data61, CSIRO # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
""" Link inference functions for link classification (including link prediction) and link attribute inference (regression) """ from typing import AnyStr, Optional, List, Tuple from keras.layers import Layer, Concatenate, Dense, Lambda, Multiply, Average, Reshape from keras import backend as K class LeakyClippedLinear(Layer): """ Leaky Clipped Linear Unit. Args: low (float): Lower threshold high (float): Lower threshold alpha (float) The slope of the function below low or above high. """ def __init__( self, low: float = 1.0, high: float = 5.0, alpha: float = 0.1, **kwargs ): super().__init__(**kwargs) self.supports_masking = True self.gamma = K.cast_to_floatx(1 - alpha) self.lo = K.cast_to_floatx(low) self.hi = K.cast_to_floatx(high) def call(self, x, mask=None): x_lo = K.relu(self.lo - x) x_hi = K.relu(x - self.hi) return x + self.gamma * x_lo - self.gamma * x_hi def get_config(self): config = { "alpha": float(1 - self.gamma), "low": float(self.lo), "high": float(self.hi), } base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) def compute_output_shape(self, input_shape): return input_shape def link_inference( output_dim: int = 1, output_act: AnyStr = "linear", edge_feature_method: AnyStr = "ip", clip_limits: Optional[Tuple[float]] = None, name: AnyStr = "link_inference", ): """ Defines an edge inference function that takes source, destination node features as input, and returns a numeric vector of output_dim size. Args: output_dim (int): Number of predictor's output units -- desired dimensionality of the output. output_act (str), optional: activation function applied to the output, one of "softmax", "sigmoid", etc., or any activation function supported by Keras, see https://keras.io/activations/ for more information. edge_feature_method (str), optional: Name of the method of combining (src,dst) node features into edge features. One of * 'concat' -- concatenation, * 'ip' or 'dot' -- inner product, :math:`ip(u,v) = sum_{i=1..d}{u_i*v_i}`, * 'mul' or 'hadamard' -- element-wise multiplication, :math:`h(u,v)_i = u_i*v_i`, * 'l1' -- :math:`l_1(u,v)_i = |u_i-v_i|`, * 'l2' -- :math:`l_2(u,v)_i = (u_i-v_i)^2`, * 'avg' -- :math:`avg(u,v) = (u+v)/2`. clip_limits (Tuple[float]): lower and upper thresholds for LeakyClippedLinear unit on top. If None (not provided), the LeakyClippedLinear unit is not applied. name (str): optional name of the defined function, used for error logging Returns: Function taking edge tensors with src, dst node features (i.e., pairs of (node_src, node_dst) tensors) and returning a vector of output_dim length (e.g., edge class probabilities, edge attribute prediction, etc.). 
""" def edge_function(x): x0 = x[0] x1 = x[1] if edge_feature_method == "ip" or edge_feature_method == "dot": out = Lambda(lambda x: K.sum(x[0] * x[1], axis=-1, keepdims=False))( [x0, x1] ) elif edge_feature_method == "l1": # l1(u,v)_i = |u_i - v_i| - vector of the same size as u,v le = Lambda(lambda x: K.abs(x[0] - x[1]))([x0, x1]) # add dense layer to convert le to the desired output: out = Dense(output_dim, activation=output_act)(le) out = Reshape((output_dim,))(out) elif edge_feature_method == "l2": # l2(u,v)_i = (u_i - v_i)^2 - vector of the same size as u,v le = Lambda(lambda x: K.square(x[0] - x[1]))([x0, x1]) # add dense layer to convert le to the desired output: out = Dense(output_dim, activation=output_act)(le) out = Reshape((output_dim,))(out) elif edge_feature_method == "mul" or edge_feature_method == "hadamard": le = Multiply()([x0, x1]) # add dense layer to convert le to the desired output: out = Dense(output_dim, activation=output_act)(le) out = Reshape((output_dim,))(out) elif edge_feature_method == "concat": le = Concatenate()([x0, x1]) # add dense layer to convert le to the desired output: out = Dense(output_dim, activation=output_act)(le) out = Reshape((output_dim,))(out) elif edge_feature_method == "avg": le = Average()([x0, x1]) # add dense layer to convert le to the desired output: out = Dense(output_dim, activation=output_act)(le) out = Reshape((output_dim,))(out) else: raise NotImplementedError( "{}: the requested method '{}' is not known/not implemented".format( name, edge_feature_method ) ) if clip_limits: out = LeakyClippedLinear( low=clip_limits[0], high=clip_limits[1], alpha=0.1 )(out) return out print( "{}: using '{}' method to combine node embeddings into edge embeddings".format( name, edge_feature_method ) ) return edge_function def link_classification( output_dim: int = 1, output_act: AnyStr = "sigmoid", edge_feature_method: AnyStr = "ip", ): """ Defines a function that predicts a binary or multi-class edge classification output from (source, destination) node features. Args: output_dim (int): Number of classifier's output units -- desired dimensionality of the output, output_act (str), optional: activation function applied to the output, one of "softmax", "sigmoid", etc., or any activation function supported by Keras, see https://keras.io/activations/ for more information. edge_feature_method (str), optional: Name of the method of combining (src,dst) node features into edge features. One of: * 'concat' -- concatenation, * 'ip' or 'dot' -- inner product, :math:`ip(u,v) = sum_{i=1..d}{u_i*v_i}`, * 'mul' or 'hadamard' -- element-wise multiplication, :math:`h(u,v)_i = u_i*v_i`, * 'l1' -- :math:`l_1(u,v)_i = |u_i-v_i|`, * 'l2' -- :math:`l_2(u,v)_i = (u_i-v_i)^2`, * 'avg' -- :math:`avg(u,v) = (u+v)/2`. Returns: Function taking edge tensors with src, dst node features (i.e., pairs of (node_src, node_dst) tensors) and returning logits of output_dim length (e.g., edge class probabilities). """ edge_function = link_inference( output_dim=output_dim, output_act=output_act, edge_feature_method=edge_feature_method, name="link_classification", ) return edge_function def link_regression( output_dim: int = 1, clip_limits: Optional[Tuple[float]] = None, edge_feature_method: AnyStr = "ip", ): """ Defines a function that predicts a numeric edge regression output vector/scalar from (source, destination) node features. 
Args: output_dim (int): Number of predictor's output units -- desired dimensionality of the output. clip_limits (tuple): lower and upper thresholds for the LeakyClippedLinear unit on top. If None (not provided), the LeakyClippedLinear unit is not applied. edge_feature_method (str), optional: Name of the method of combining (src,dst) node features into edge features. One of: * 'concat' -- concatenation, * 'ip' or 'dot' -- inner product, :math:`ip(u,v) = \sum_{i=1}^{d} u_i v_i`, * 'mul' or 'hadamard' -- element-wise multiplication, :math:`h(u,v)_i = u_i v_i`, * 'l1' -- :math:`l_1(u,v)_i = |u_i-v_i|`, * 'l2' -- :math:`l_2(u,v)_i = (u_i-v_i)^2`, * 'avg' -- :math:`avg(u,v) = (u+v)/2`. Returns: Function taking edge tensors with src, dst node features (i.e., pairs of (node_src, node_dst) tensors) and returning a numeric value (e.g., the edge attribute being predicted) constructed according to edge_feature_method. """ edge_function = link_inference( output_dim=output_dim, output_act="linear", edge_feature_method=edge_feature_method, clip_limits=clip_limits, name="link_regression", ) return edge_function PK![*==stellargraph/mapper/__init__.py# -*- coding: utf-8 -*- # # Copyright 2018 Data61, CSIRO # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ The mapper package contains classes and functions to map graph data to neural network inputs """ # __all__ = ["link_mappers", "node_mappers"] # Expose the mappers from .node_mappers import * from .link_mappers import * PK!DD#stellargraph/mapper/link_mappers.py# -*- coding: utf-8 -*- # # Copyright 2018 Data61, CSIRO # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Generators that create batches of data from a machine-learning ready graph for link prediction/link attribute inference problems using GraphSAGE and HinSAGE.
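Example (an illustrative sketch; assumes ``G`` is a machine-learning ready StellarGraph, and ``edge_ids``/``edge_labels`` are hypothetical training links and targets)::

    generator = GraphSAGELinkGenerator(G, batch_size=50, num_samples=[10, 5])
    train_seq = generator.flow(edge_ids, targets=edge_labels)
    # train_seq can then be passed to keras.Model.fit_generator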
""" __all__ = ["LinkSequence", "GraphSAGELinkGenerator", "HinSAGELinkGenerator"] from stellargraph.core.graph import StellarGraphBase import numpy as np import itertools as it import operator from functools import reduce import keras from keras.utils import Sequence from stellargraph.data.explorer import ( SampledBreadthFirstWalk, SampledHeterogeneousBreadthFirstWalk, ) from ..core.utils import is_real_iterable class LinkSequence(Sequence): """Keras-compatible data generator to use with Keras methods :meth:`keras.Model.fit_generator`, :meth:`keras.Model.evaluate_generator`, and :meth:`keras.Model.predict_generator` This class generates data samples for link inference models and should be created using the :meth:`flow` method of :class:`GraphSAGELinkGenerator` or :class:`HinSAGELinkGenerator` . Args: generator: An instance of :class:`GraphSAGELinkGenerator` or :class:`HinSAGELinkGenerator`. ids: Link IDs to batch, each link id being a tuple of (src, dst) node ids. (The graph nodes must have a "feature" attribute that is used as input to the GraphSAGE model.) These are the links that are to be used to train or inference, and the embeddings calculated for these links via a binary operator applied to their source and destination nodes, are passed to the downstream task of link prediction or link attribute inference. The source and target nodes of the links are used as head nodes for which subgraphs are sampled. The subgraphs are sampled from all nodes. targets: Labels corresponding to the above links, e.g., 0 or 1 for the link prediction problem. node_types: Node types of the target edges """ def __init__(self, generator, ids, targets=None): # Check that ids is an iterable if not is_real_iterable(ids): raise TypeError("IDs must be an iterable or numpy array of graph node IDs") # Check targets is iterable & has the correct length if targets is not None: if not is_real_iterable(targets): raise TypeError("Targets must be None or an iterable or numpy array ") if len(ids) != len(targets): raise ValueError( "The length of the targets must be the same as the length of the ids" ) # Ensure number of labels matches number of ids if targets is not None and len(ids) != len(targets): raise ValueError("Length of link ids must match length of link targets") self.generator = generator self.ids = list(ids) self.targets = np.asanyarray(targets) self.data_size = len(self.ids) # Get head node types from all src, dst nodes extracted from all links, # and make sure there's only one pair of node types: self.head_node_types = self._infer_head_node_types(generator.schema) self._sampling_schema = generator.schema.sampling_layout( self.head_node_types, generator.num_samples ) self.type_adjacency_list = generator.schema.type_adjacency_list( self.head_node_types, len(generator.num_samples) ) def _infer_head_node_types(self, schema): """Get head node types from all src, dst nodes extracted from all links in self.ids""" head_node_types = [] for src, dst in self.ids: # loop over all edges in self.ids head_node_types.append(tuple(schema.get_node_type(v) for v in (src, dst))) head_node_types = list(set(head_node_types)) if len(head_node_types) != 1: raise RuntimeError( "All (src,dst) node types for inferred links must be of the same type!" 
) return head_node_types.pop() def __len__(self): "Denotes the number of batches per epoch" return int(np.ceil(self.data_size / self.generator.batch_size)) def __getitem__(self, batch_num): """ Generate one batch of data Args: batch_num (int): number of the batch Returns: batch_feats (list): Node features for nodes and neighbours sampled from a batch of the supplied IDs batch_targets (list): Targets/labels for the batch. """ start_idx = self.generator.batch_size * batch_num end_idx = start_idx + self.generator.batch_size if start_idx >= self.data_size: raise IndexError("Mapper: batch_num larger than length of data") # print("Fetching {} batch {} [{}]".format(self.name, batch_num, start_idx)) # Get head nodes head_ids = self.ids[start_idx:end_idx] # Get targets for nodes if self.targets is None: batch_targets = None else: batch_targets = self.targets[start_idx:end_idx] # Get sampled nodes batch_feats = self.generator.sample_features(head_ids, self._sampling_schema) return batch_feats, batch_targets class GraphSAGELinkGenerator: """A data generator for link prediction with Homogeneous GraphSAGE models At minimum, supply the StellarGraph, the batch size, and the number of node samples for each layer of the GraphSAGE model. The supplied graph should be a StellarGraph object that is ready for machine learning. Currently the model requires node features for all nodes in the graph. Use the :meth:`.flow` method supplying the nodes and (optionally) targets to get an object that can be used as a Keras data generator. Example:: G_generator = GraphSAGELinkGenerator(G, 50, [10,10]) train_data_gen = G_generator.flow(edge_ids) Args: G (StellarGraph): A machine-learning ready graph. batch_size (int): Size of batch of links to return. num_samples (list): List of number of neighbour node samples per GraphSAGE layer (hop) to take. seed (int or str), optional: Random seed for the sampling methods. name (str), optional: Name of the generator """ def __init__(self, G, batch_size, num_samples, seed=None, name=None): if not isinstance(G, StellarGraphBase): raise TypeError("Graph must be a StellarGraph object.") G.check_graph_for_ml(features=True) self.graph = G self.num_samples = num_samples self.batch_size = batch_size self.name = name # The sampler used to generate random samples of neighbours; pass the seed # through so that sampling is reproducible self.sampler = SampledBreadthFirstWalk(G, seed=seed) # We need a schema for compatibility with HinSAGE self.schema = G.create_graph_schema(create_type_maps=True) def sample_features(self, head_links, sampling_schema): """ Sample neighbours recursively from the head nodes, collect the features of the sampled nodes, and return these as a list of feature arrays for the GraphSAGE algorithm. Args: head_links: An iterable of edges to perform sampling for. sampling_schema: The sampling schema for the model Returns: A list of feature arrays for the sampled nodes, with source and destination features alternating per hop, each of shape ``(len(head_links), num_sampled_at_layer, feature_size)`` where num_sampled_at_layer is the cumulative product of `num_samples` up to that layer.
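Example (a shape sketch under assumed settings): with two head links, ``num_samples=[10, 5]`` and node feature size ``d``, the returned list alternates source and destination arrays per hop, with shapes ``(2, 1, d)``, ``(2, 1, d)``, ``(2, 10, d)``, ``(2, 10, d)``, ``(2, 50, d)``, ``(2, 50, d)``.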
""" node_type = sampling_schema[0][0][0] head_size = len(head_links) # Get sampled nodes for the subgraphs for the edges where each edge is a tuple # of 2 nodes, so we are extracting 2 head nodes per edge batch_feats = [] for hns in zip(*head_links): node_samples = self.sampler.run(nodes=hns, n=1, n_size=self.num_samples) # Reshape node samples to sensible format def get_levels(loc, lsize, samples_per_hop, walks): end_loc = loc + lsize walks_at_level = list(it.chain(*[w[loc:end_loc] for w in walks])) if len(samples_per_hop) < 1: return [walks_at_level] return [walks_at_level] + get_levels( end_loc, lsize * samples_per_hop[0], samples_per_hop[1:], walks ) nodes_per_hop = get_levels(0, 1, self.num_samples, node_samples) # Get features for the sampled nodes batch_feats.append( [ self.graph.get_feature_for_nodes(layer_nodes, node_type) for layer_nodes in nodes_per_hop ] ) # Resize features to (batch_size, n_neighbours, feature_size) # and re-pack features into a list where source, target feats alternate # This matches the GraphSAGE link model with (node_src, node_dst) input sockets: batch_feats = [ np.reshape(feats, (head_size, -1, feats.shape[1])) for ab in zip(*batch_feats) for feats in ab ] return batch_feats def flow(self, link_ids, targets=None): """ Creates a generator/sequence object for training or evaluation with the supplied edge IDs and numeric targets. The edge IDs are the edges to train or inference on. They are expected to by tuples of (source_id, destination_id). The targets are an array of numeric targets corresponding to the supplied link_ids to be used by the downstream task. They should be given in the same order as the list of link IDs. If they are not specified (for example, for use in prediction), the targets will not be available to the downsteam task. Args: link_ids: an iterable of (src_id, dst_id) tuples specifying the edges. targets: a 2D array of numeric targets with shape `(len(link_ids), target_size)` Returns: A LinkSequence object to use with the GraphSAGE model methods :meth:`fit_generator`, :meth:`evaluate_generator`, and :meth:`predict_generator` """ return LinkSequence(self, link_ids, targets) class HinSAGELinkGenerator: """A data generator for link prediction with Heterogeneous HinSAGE models At minimum, supply the StellarGraph, the batch size, and the number of node samples for each layer of the GraphSAGE model. The supplied graph should be a StellarGraph object that is ready for machine learning. Currently the model requires node features for all nodes in the graph. Use the :meth:`flow` method supplying the nodes and (optionally) targets to get an object that can be used as a Keras data generator. Example: ``` G_generator = HinSAGELinkGenerator(G, 50, [10,10]) data_gen = G_generator.flow(edge_ids) ``` Notes: We don't need to pass link_type (target link type) to the link mapper, considering that: 1. The mapper actually only cares about (src,dst) node types, and these can be inferred from the passed link ids (although this might be expensive, as it requires parsing the links ids passed - yet only once) 2. It's possible to do link prediction on a graph where that link type is completely removed from the graph (e.g., "same_as" links in ER) Args: g (StellarGraph): A machine-learning ready graph. batch_size (int): Size of batch of links to return. num_samples (list): List of number of neighbour node samples per GraphSAGE layer (hop) to take. seed (int or str), optional: Random seed for the sampling methods. 
name (str), optional: Name of the generator """ def __init__(self, G, batch_size, num_samples, seed=None, name=None): if not isinstance(G, StellarGraphBase): raise TypeError("Graph must be a StellarGraph object.") G.check_graph_for_ml(features=True) self.graph = G self.num_samples = num_samples self.batch_size = batch_size self.name = name # We need a schema for compatibility with HinSAGE self.schema = G.create_graph_schema(create_type_maps=True) # The sampler used to generate random samples of neighbours self.sampler = SampledHeterogeneousBreadthFirstWalk( G, graph_schema=self.schema, seed=seed ) def _get_features(self, node_samples, head_size): """ Collect features from sampled nodes. Args: node_samples: A list of lists of node IDs head_size: The number of head nodes (typically the batch size). Returns: A list of numpy arrays that store the features for each head node. """ # Note that if there are no samples for a node, a zero array is returned. # Get features for each node type (note that feature sizes can differ # between node types) batch_feats = [ self.graph.get_feature_for_nodes(layer_nodes, nt) for nt, layer_nodes in node_samples ] # Resize features to (batch_size, n_neighbours, feature_size) batch_feats = [np.reshape(a, (head_size, -1, a.shape[1])) for a in batch_feats] return batch_feats def sample_features(self, head_links, sampling_schema): """ Sample neighbours recursively from the head nodes, collect the features of the sampled nodes, and return these as a list of feature arrays for the GraphSAGE algorithm. Args: head_links: An iterable of edges to perform sampling for. sampling_schema: The sampling schema for the model Returns: A list of the same length as `num_samples` of collected features from the sampled nodes of shape: `(len(head_nodes), num_sampled_at_layer, feature_size)` where num_sampled_at_layer is the cumulative product of `num_samples` for that layer. """ nodes_by_type = [] for ii in range(2): # Extract head nodes from edges: each edge is a tuple of 2 nodes, so we are extracting 2 head nodes per edge head_nodes = [e[ii] for e in head_links] # Get sampled nodes for the subgraphs starting from the (src, dst) head nodes node_samples = self.sampler.run( nodes=head_nodes, n=1, n_size=self.num_samples ) # Reshape node samples to the required format for the HinSAGE model # This requires grouping the sampled nodes by edge type and in order # nodes_by_type will be a list of two lists: [[samples for src], [samples for dst]] nodes_by_type.append( [ ( nt, reduce( operator.concat, (samples[ks] for samples in node_samples for ks in indices), [], ), ) for nt, indices in sampling_schema[ii] ] ) # Interlace the two lists, nodes_by_type[0] (for src head nodes) and nodes_by_type[1] (for dst head nodes) nodes_by_type = [ tuple((ab[0][0], reduce(operator.concat, (ab[0][1], ab[1][1])))) for ab in zip(nodes_by_type[0], nodes_by_type[1]) ] batch_feats = self._get_features(nodes_by_type, len(head_links)) return batch_feats def flow(self, link_ids, targets=None): """ Creates a generator/sequence object for training or evaluation with the supplied edge IDs and numeric targets. The edge IDs are the edges to train on or perform inference for. They are expected to be tuples of (source_id, destination_id). The targets are an array of numeric targets corresponding to the supplied link_ids to be used by the downstream task. They should be given in the same order as the list of link IDs.
If they are not specified (for example, for use in prediction), the targets will not be available to the downstream task. Args: link_ids: an iterable of (src_id, dst_id) tuples specifying the edges. targets: a 2D array of numeric targets with shape ``(len(link_ids), target_size)`` Returns: A LinkSequence object to use with the GraphSAGE model methods :meth:`fit_generator`, :meth:`evaluate_generator`, and :meth:`predict_generator` """ return LinkSequence(self, link_ids, targets) PK!"A==#stellargraph/mapper/node_mappers.py# -*- coding: utf-8 -*- # # Copyright 2018 Data61, CSIRO # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Mappers to provide input data for the graph models in layers. """ __all__ = ["NodeSequence", "GraphSAGENodeGenerator", "HinSAGENodeGenerator"] import operator from functools import reduce import numpy as np import itertools as it from keras.utils import Sequence from ..data.explorer import ( SampledBreadthFirstWalk, SampledHeterogeneousBreadthFirstWalk, ) from ..core.graph import StellarGraphBase from ..core.utils import is_real_iterable class NodeSequence(Sequence): """Keras-compatible data generator to use with the Keras methods :meth:`keras.Model.fit_generator`, :meth:`keras.Model.evaluate_generator`, and :meth:`keras.Model.predict_generator`. This class generates data samples for node inference models and should be created using the `.flow(...)` method of :class:`GraphSAGENodeGenerator` or :class:`HinSAGENodeGenerator`. These generators capture the graph structure and the feature vectors of each node; they are used within the NodeSequence to sample k-hop neighbourhoods in the graph and to return the features from the sampled neighbourhoods. Args: generator: GraphSAGENodeGenerator or HinSAGENodeGenerator The generator object containing the graph information. ids: list A list of the node_ids to be used as head-nodes in the downstream task. targets: list, optional (default=None) A list of targets or labels to be used in the downstream task. """ def __init__(self, generator, ids, targets=None): # Check that ids is an iterable if not is_real_iterable(ids): raise TypeError("IDs must be an iterable or numpy array of graph node IDs") # Check targets is iterable & has the correct length if targets is not None: if not is_real_iterable(targets): raise TypeError("Targets must be None or an iterable or numpy array") if len(ids) != len(targets): raise ValueError( "The length of the targets must be the same as the length of the ids" ) # Infer head_node_type # TODO: Generalize to multiple head node target types?
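# All head nodes must currently share a single node type: look up the type of each supplied ID in the graph schema and fail early if the ids span more than one type.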
head_node_types = set([generator.schema.get_node_type(n) for n in ids]) if len(head_node_types) > 1: raise ValueError( "Only a single head node type is currently supported for HinSAGE models" ) head_node_type = head_node_types.pop() # Store the generator to draw samples from graph self.generator = generator self.ids = list(ids) self.targets = targets self.data_size = len(self.ids) # Save head node type and generate sampling schema self.head_node_types = [head_node_type] self._sampling_schema = generator.schema.sampling_layout( self.head_node_types, generator.num_samples ) def __len__(self): "Denotes the number of batches per epoch" return int(np.ceil(self.data_size / self.generator.batch_size)) def __getitem__(self, batch_num): """ Generate one batch of data Args: batch_num (int): number of a batch Returns: batch_feats (list): Node features for nodes and neighbours sampled from a batch of the supplied IDs batch_targets (list): Targets/labels for the batch. """ start_idx = self.generator.batch_size * batch_num end_idx = start_idx + self.generator.batch_size if start_idx >= self.data_size: raise IndexError("Mapper: batch_num larger than length of data") # print("Fetching batch {} [{}]".format(batch_num, start_idx)) # Get head nodes head_ids = self.ids[start_idx:end_idx] # Get targets for nodes if self.targets is None: batch_targets = None else: batch_targets = self.targets[start_idx:end_idx] # Get sampled nodes batch_feats = self.generator.sample_features(head_ids, self._sampling_schema) return batch_feats, batch_targets class GraphSAGENodeGenerator: """A data generator for node prediction with Homogeneous GraphSAGE models At minimum, supply the StellarGraph, the batch size, and the number of node samples for each layer of the GraphSAGE model. The supplied graph should be a StellarGraph object that is ready for machine learning. Currently the model requires node features for all nodes in the graph. Use the :meth:`flow` method supplying the nodes and (optionally) targets to get an object that can be used as a Keras data generator. Example:: G_generator = GraphSAGENodeGenerator(G, 50, [10,10]) train_data_gen = G_generator.flow(node_ids) Args: G (StellarGraph): The machine-learning ready graph. batch_size (int): Size of batch to return. num_samples (list): The number of samples per layer (hop) to take. seed (int): [Optional] Random seed for the node sampler. name (str or None): Name of the generator (optional) """ def __init__(self, G, batch_size, num_samples, seed=None, name=None): if not isinstance(G, StellarGraphBase): raise TypeError("Graph must be a StellarGraph object.") self.graph = G self.num_samples = num_samples self.batch_size = batch_size self.name = name # Check if the graph has features G.check_graph_for_ml() # Create sampler for GraphSAGE self.sampler = SampledBreadthFirstWalk(G, seed=seed) # We need a schema for compatibility with HinSAGE self.schema = G.create_graph_schema(create_type_maps=True) # Check that there is only a single node type for GraphSAGE if len(self.schema.node_types) > 1: print( "Warning: running homogeneous GraphSAGE on a graph with multiple node types" ) def sample_features(self, head_nodes, sampling_schema): """ Sample neighbours recursively from the head nodes, collect the features of the sampled nodes, and return these as a list of feature arrays for the GraphSAGE algorithm. Args: head_nodes: An iterable of head nodes to perform sampling on. 
sampling_schema: The sampling schema for the model Returns: A list of the same length as ``num_samples`` of collected features from the sampled nodes of shape: ``(len(head_nodes), num_sampled_at_layer, feature_size)`` where num_sampled_at_layer is the cumulative product of `num_samples` for that layer. """ node_samples = self.sampler.run(nodes=head_nodes, n=1, n_size=self.num_samples) # Reshape node samples to sensible format def get_levels(loc, lsize, samples_per_hop, walks): end_loc = loc + lsize walks_at_level = list(it.chain(*[w[loc:end_loc] for w in walks])) if len(samples_per_hop) < 1: return [walks_at_level] return [walks_at_level] + get_levels( end_loc, lsize * samples_per_hop[0], samples_per_hop[1:], walks ) nodes_per_hop = get_levels(0, 1, self.num_samples, node_samples) node_type = sampling_schema[0][0][0] # Get features for sampled nodes batch_feats = [ self.graph.get_feature_for_nodes(layer_nodes, node_type) for layer_nodes in nodes_per_hop ] # Resize features to (batch_size, n_neighbours, feature_size) batch_feats = [ np.reshape(a, (len(head_nodes), -1 if np.size(a) > 0 else 0, a.shape[1])) for a in batch_feats ] return batch_feats def flow(self, node_ids, targets=None): """ Creates a generator/sequence object for training or evaluation with the supplied node ids and numeric targets. The node IDs are the nodes to train on or perform inference for: the embeddings calculated for these nodes are passed to the downstream task. These are a subset of the nodes in the graph. The targets are an array of numeric targets corresponding to the supplied node_ids to be used by the downstream task. They should be given in the same order as the list of node IDs. If they are not specified (for example, for use in prediction), the targets will not be available to the downstream task. Args: node_ids: an iterable of node IDs targets: a 2D array of numeric targets with shape `(len(node_ids), target_size)` Returns: A NodeSequence object to use with the GraphSAGE model in Keras methods ``fit_generator``, ``evaluate_generator``, and ``predict_generator`` """ return NodeSequence(self, node_ids, targets) def flow_from_dataframe(self, node_targets): """ Creates a generator/sequence object for training or evaluation with the supplied node ids and numeric targets. Args: node_targets: a Pandas DataFrame of numeric targets indexed by the node ID for that target. Returns: A NodeSequence object to use with the GraphSAGE model in Keras methods ``fit_generator``, ``evaluate_generator``, and ``predict_generator`` """ return NodeSequence(self, node_targets.index, node_targets.values) class HinSAGENodeGenerator: """Keras-compatible data mapper for Heterogeneous GraphSAGE (HinSAGE) At minimum, supply the StellarGraph, the batch size, and the number of node samples for each layer of the HinSAGE model. The supplied graph should be a StellarGraph object that is ready for machine learning. Currently the model requires node features for all nodes in the graph. Use the :meth:`flow` method supplying the nodes and (optionally) targets to get an object that can be used as a Keras data generator. Example:: G_generator = HinSAGENodeGenerator(G, 50, [10,10]) data_gen = G_generator.flow(node_ids) Args: G (StellarGraph): The machine-learning ready graph batch_size (int): Size of batch to return num_samples (list): The number of samples per layer (hop) to take seed (int), optional: Random seed for the node sampler name (str), optional: Name of the generator.
""" def __init__(self, G, batch_size, num_samples, seed=None, name=None): self.graph = G self.num_samples = num_samples self.batch_size = batch_size self.name = name # We require a StellarGraph if not isinstance(G, StellarGraphBase): raise TypeError("Graph must be a StellarGraph object.") G.check_graph_for_ml(features=True) # Create sampler for GraphSAGE self.sampler = SampledHeterogeneousBreadthFirstWalk(G, seed=seed) # Generate schema self.schema = G.create_graph_schema(create_type_maps=True) def sample_features(self, head_nodes, sampling_schema): """ Sample neighbours recursively from the head nodes, collect the features of the sampled nodes, and return these as a list of feature arrays for the GraphSAGE algorithm. Args: head_nodes: An iterable of head nodes to perform sampling on. node_sampling_schema: The sampling schema for the HinSAGE model, this is can be generated by the ``GraphSchema`` object. Returns: A list of the same length as ``num_samples`` of collected features from the sampled nodes of shape: ``(len(head_nodes), num_sampled_at_layer, feature_size)`` where num_sampled_at_layer is the cumulative product of `num_samples` for that layer. """ # Get sampled nodes node_samples = self.sampler.run(nodes=head_nodes, n=1, n_size=self.num_samples) # Reshape node samples to the required format for the HinSAGE model # This requires grouping the sampled nodes by edge type and in order nodes_by_type = [ ( nt, reduce( operator.concat, (samples[ks] for samples in node_samples for ks in indices), ), ) for nt, indices in sampling_schema[0] ] # Get features batch_feats = [ self.graph.get_feature_for_nodes(layer_nodes, nt) for nt, layer_nodes in nodes_by_type ] # Resize features to (batch_size, n_neighbours, feature_size) batch_feats = [ np.reshape(a, (len(head_nodes), -1 if np.size(a) > 0 else 0, a.shape[1])) for a in batch_feats ] return batch_feats def flow(self, node_ids, targets=None): """ Creates a generator/sequence object for training or evaluation with the supplied node ids and numeric targets. The node IDs are the nodes to train or inference on: the embeddings calculated for these nodes are passed to the downstream task. These are a subset of the nodes in the graph. The targets are an array of numeric targets corresponding to the supplied node_ids to be used by the downstream task. They should be given in the same order as the list of node IDs. If they are not specified (for example, for use in prediction), the targets will not be available to the downsteam task. Args: node_ids (iterable): The head node IDs targets (Numpy array): a 2D array of numeric targets with shape ``(len(node_ids), target_size)`` node_type (str), optional: The target node type, if not given the node type will be inferred from the graph. Returns: A NodeSequence object to use with the GraphSAGE model in Keras methods `fit_generator`, `evaluate_generator`, and `predict_generator`. """ return NodeSequence(self, node_ids, targets) def flow_from_dataframe(self, node_targets): """ Creates a generator/sequence object for training or evaluation with the supplied node ids and numeric targets. Args: node_targets (DataFrame): Numeric targets indexed by the node ID for that target. node_type (str), optional: The target node type, if not given the node type will be inferred from the graph. Returns: A NodeSequence object to use with the GraphSAGE model in Keras methods `fit_generator`, `evaluate_generator`, and `predict_generator`. 
""" return NodeSequence(self, node_targets.index, node_targets.values) PK!x,,$stellargraph-0.4.0.dist-info/LICENSE Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. 
Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [2018] [Commonwealth Scientific and Industrial Research Organisation (CSIRO)] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. [Binary wheel metadata omitted: stellargraph-0.4.0.dist-info/WHEEL, stellargraph-0.4.0.dist-info/METADATA, stellargraph-0.4.0.dist-info/RECORD, and the zip central directory]