schematic.schemas.data_model_graph

DataModel Graph

  1"""DataModel Graph"""
  2
  3import logging
  4from typing import Any, Optional, Union, AbstractSet
  5
  6import graphviz  # type: ignore
  7import networkx as nx  # type: ignore
  8from opentelemetry import trace
  9
 10from schematic.schemas.data_model_parser import DataModelParser
 11from schematic.schemas.data_model_edges import DataModelEdges
 12from schematic.schemas.data_model_nodes import DataModelNodes
 13from schematic.schemas.data_model_relationships import DataModelRelationships
 14from schematic.schemas.constants import JSONSchemaType
 15from schematic.utils.general import unlist
 16from schematic.utils.schema_utils import (
 17    DisplayLabelType,
 18    extract_component_validation_rules,
 19    get_class_label_from_display_name,
 20    get_property_label_from_display_name,
 21)
 22from schematic.utils.validate_utils import rule_in_rule_list
 23from schematic.utils.viz_utils import visualize
 24
 25logger = logging.getLogger(__name__)
 26
 27
 28logger = logging.getLogger(__name__)
 29tracer = trace.get_tracer("Schematic")
 30
 31
 32class DataModelGraphMeta:  # pylint: disable=too-few-public-methods
 33    """DataModelGraphMeta"""
 34
 35    _instances: dict = {}
 36
 37    def __call__(  # pylint: disable=no-self-argument
 38        cls, *args: Any, **kwargs: Any
 39    ) -> Any:
 40        """
 41        Possible changes to the value of the `__init__` argument do not affect
 42        the returned instance.
 43        """
 44        if cls not in cls._instances:
 45            instance = super().__call__(*args, **kwargs)  # type: ignore # pylint: disable=no-member
 46            cls._instances[cls] = instance
 47        return cls._instances[cls]
 48
 49
 50class DataModelGraph:  # pylint: disable=too-few-public-methods
 51    """
 52    Generate graph network (networkx) from the attributes and relationships returned
 53    from the data model parser.
 54
 55    Create a singleton.
 56    """
 57
 58    __metaclass__ = DataModelGraphMeta
 59
 60    def __init__(
 61        self,
 62        attribute_relationships_dict: dict,
 63        data_model_labels: DisplayLabelType = "class_label",
 64    ) -> None:
 65        """Load parsed data model.
 66        Args:
 67            attributes_relationship_dict, dict: generated in data_model_parser
 68                {Attribute Display Name: {
 69                        Relationships: {
 70                                    CSV Header: Value}}}
 71            data_model_labels: str, display_label or class_label.
 72                display_label, use the display name as a label, if it is valid
 73                (contains no blacklisted characters) otherwise will default to schema_label.
 74                class_label, default, use standard class or property label.
 75        Raises:
 76            ValueError, attribute_relationship_dict not loaded.
 77        """
 78        self.attribute_relationships_dict = attribute_relationships_dict
 79        self.dmn = DataModelNodes(self.attribute_relationships_dict)
 80        self.dme = DataModelEdges()
 81        self.dmr = DataModelRelationships()
 82        self.data_model_labels = data_model_labels
 83
 84        if not self.attribute_relationships_dict:
 85            raise ValueError(
 86                (
 87                    "Something has gone wrong, a data model was not loaded into the DataModelGraph "
 88                    "Class. Please check that your paths are correct"
 89                )
 90            )
 91        self.graph = self.generate_data_model_graph()
 92
 93    @tracer.start_as_current_span("DataModelGraph::generate_data_model_graph")
 94    def generate_data_model_graph(self) -> nx.MultiDiGraph:
 95        """
 96        Generate NetworkX Graph from the Relationships/attributes dictionary, the graph is built
 97          by first adding all nodes to the graph, then connecting nodes by the relationships defined
 98          in the attributes_relationship dictionary.
 99        Returns:
100            G: nx.MultiDiGraph, networkx graph representation of the data model
101        """
102        # Get all relationships with edges
103        edge_relationships = self.dmr.retrieve_rel_headers_dict(edge=True)
104
105        # Find all nodes
106        all_nodes = self.dmn.gather_all_nodes_in_model(
107            attr_rel_dict=self.attribute_relationships_dict
108        )
109
110        # Instantiate NetworkX MultiDigraph
111        graph: nx.MultiDiGraph = nx.MultiDiGraph()
112
113        all_node_dict = {}
114
115        ## Fill in MultiDigraph with nodes
116        for node in all_nodes:
117            # Gather information for each node
118            node_dict = self.dmn.generate_node_dict(
119                node_display_name=node,
120                attr_rel_dict=self.attribute_relationships_dict,
121                data_model_labels=self.data_model_labels,
122            )
123
124            # Add each node to the all_node_dict to be used for generating edges
125            all_node_dict[node] = node_dict
126
127            # Generate node and attach information (attributes) to each node
128            graph = self.dmn.generate_node(graph, node_dict)
129
130        edge_list: list[tuple[str, str, dict[str, Union[str, int]]]] = []
131        ## Connect nodes via edges
132        for node in all_nodes:
133            # Generate edges
134            edge_list_2 = self.dme.generate_edge(
135                node,
136                all_node_dict,
137                self.attribute_relationships_dict,
138                edge_relationships,
139                edge_list,
140            )
141            edge_list = edge_list_2.copy()
142
143        # Add edges to the Graph
144        for node_1, node_2, edge_dict in edge_list:
145            graph.add_edge(
146                node_1, node_2, key=edge_dict["key"], weight=edge_dict["weight"]
147            )
148        return graph
149
150
151class DataModelGraphExplorer:  # pylint: disable=too-many-public-methods
152    """DataModelGraphExplorer"""
153
154    def __init__(
155        self,
156        graph: nx.MultiDiGraph,
157    ):
158        """Load data model graph as a singleton.
159        Args:
160            G: nx.MultiDiGraph, networkx graph representation of the data model
161        """
162        self.graph = graph  # At this point the graph is expected to be fully formed.
163        self.dmr = DataModelRelationships()
164
165    def find_properties(self) -> set[str]:
166        """
167        Identify all properties, as defined by the first node in a pair, connected with
168        'domainIncludes' edge type
169
170        Returns:
171            properties, set: All properties defined in the data model, each property name
172              is defined by its label.
173        """
174        properties_list: list[str] = []
175        for node_1, _, rel in self.graph.edges:
176            if rel == self.dmr.get_relationship_value("domainIncludes", "edge_key"):
177                properties_list.append(node_1)
178        properties_set = set(properties_list)
179        return properties_set
180
181    def find_classes(self) -> AbstractSet[str]:
182        """
183        Identify all classes, as defined but all nodes, minus all properties
184        (which are explicitly defined)
185        Returns:
186            classes, set:  All classes defined in the data model, each class
187              name is defined by its label.
188        """
189        nodes = self.graph.nodes
190        properties = self.find_properties()
191        classes = nodes - properties
192        return classes
193
194    def find_node_range(
195        self, node_label: Optional[str] = None, node_display_name: Optional[str] = None
196    ) -> list:
197        """Get valid values for the given node (attribute)
198        Args:
199            node_label, str, Optional[str]: label of the node for which to retrieve valid values
200            node_display_name, str, Optional[str]: Display Name of the node for which to
201              retrieve valid values
202        Returns:
203            valid_values, list: List of valid values associated with the provided node.
204        """
205        node_label = self._get_node_label(node_label, node_display_name)
206
207        valid_values = []
208        for node_1, node_2, rel in self.graph.edges:
209            if node_1 == node_label and rel == self.dmr.get_relationship_value(
210                "rangeIncludes", "edge_key"
211            ):
212                valid_values.append(node_2)
213        valid_values = list(set(valid_values))
214        return valid_values
215
216    def get_adjacent_nodes_by_relationship(
217        self, node_label: str, relationship: str
218    ) -> list[str]:
219        """Get a list of nodes that is / are adjacent to a given node, based on a relationship type.
220
221        Args:
222            node_label: label of the the node whose edges we need to look at.
223            relationship: the type of link(s) that the above node and its immediate neighbors share.
224
225        Returns:
226            List of nodes that are adjacent to the given node.
227        #checked
228        """
229        nodes = set()
230        for _, node_2, key, _ in self.graph.out_edges(node_label, data=True, keys=True):
231            if key == relationship:
232                nodes.add(node_2)
233
234        return list(nodes)
235
236    def get_component_node_required(
237        self,
238        manifest_component: str,
239        node_validation_rules: Optional[list[str]] = None,
240        node_label: Optional[str] = None,
241        node_display_name: Optional[str] = None,
242    ) -> bool:
243        """Check if a node is required taking into account the manifest component it is defined in
244        (requirements can be set in validation rule as well as required column)
245        Args:
246            manifest_component: str, manifest component display name that the node belongs to.
247            node_validation_rules: list[str], validation rules for a given node and component.
248            node_label: str, Label of the node you would want to get the comment for.
249            node_display_name: str, node display name for the node being queried.
250        Returns:
251            True, if node is required, False if not
252        """
253        node_required = False
254
255        if not node_validation_rules:
256            # Get node validation rules for a given component
257            node_validation_rules = self.get_component_node_validation_rules(
258                manifest_component=manifest_component,
259                node_label=node_label,
260                node_display_name=node_display_name,
261            )
262
263        # Check if the validation rule specifies that the node is required for this particular
264        # component.
265        if rule_in_rule_list("required", node_validation_rules):
266            node_required = True
267            # To prevent any unintended errors, ensure the Required field for this node is False
268            if self.get_node_required(
269                node_label=node_label, node_display_name=node_display_name
270            ):
271                if not node_display_name:
272                    assert node_label is not None
273                    node_display_name = self.graph.nodes[node_label][
274                        self.dmr.get_relationship_value("displayName", "node_label")
275                    ]
276                error_str = " ".join(
277                    [
278                        f"For component: {manifest_component} and attribute: {node_display_name}",
279                        "requirements are being specified in both the Required field and in the",
280                        "Validation Rules. If you desire to use validation rules to set component",
281                        "specific requirements for this attribute",
282                        "then the Required field needs to be set to False, or the validation may",
283                        "not work as intended, for other components where the attribute",
284                        "that should not be required.",
285                    ]
286                )
287
288                logger.error(error_str)
289        else:
290            # If requirements are not being set in the validation rule, then just pull the
291            # standard node requirements from the model
292            node_required = self.get_node_required(
293                node_label=node_label, node_display_name=node_display_name
294            )
295        return node_required
296
297    def get_component_node_validation_rules(
298        self,
299        manifest_component: str,
300        node_label: Optional[str] = None,
301        node_display_name: Optional[str] = None,
302    ) -> list:
303        """Get validation rules for a given node and component.
304        Args:
305            manifest_component: str, manifest component display name that the node belongs to.
306            node_label: str, Label of the node you would want to get the comment for.
307            node_display_name: str, node display name for the node being queried.
308        Returns:
309            validation_rules: list, validation rules list for a given node and component.
310        """
311        # get any additional validation rules associated with this node (e.g. can this node
312        # be mapped to a list of other nodes)
313        node_validation_rules = self.get_node_validation_rules(
314            node_label=node_label, node_display_name=node_display_name
315        )
316
317        # Parse the validation rules per component if applicable
318        if node_validation_rules and isinstance(node_validation_rules, dict):
319            node_validation_rules_list = extract_component_validation_rules(
320                manifest_component=manifest_component,
321                validation_rules_dict=node_validation_rules,  # type: ignore
322            )
323        else:
324            assert isinstance(node_validation_rules, list)
325            node_validation_rules_list = node_validation_rules
326        return node_validation_rules_list
327
328    def get_component_requirements(
329        self,
330        source_component: str,
331    ) -> list[str]:
332        """
333        Get all components that are associated with a given source component and are
334          required by it.
335
336        Args:
337            source_component: source component for which we need to find all required downstream
338              components.
339
340        Returns:
341            List of nodes that are descendants from the source component are are related to the
342              source through a specific component relationship.
343        """
344
345        req_components = list(
346            reversed(
347                self.get_descendants_by_edge_type(
348                    source_component,
349                    self.dmr.get_relationship_value("requiresComponent", "edge_key"),
350                    ordered=True,
351                )
352            )
353        )
354
355        return req_components
356
357    def get_component_requirements_graph(
358        self,
359        source_component: str,
360    ) -> nx.Graph:
361        """
362        Get all components that are associated with a given source component and are required by it;
363          return the components as a dependency graph (i.e. a DAG).
364
365        Args:
366            source_component, str: source component for which we need to find all required
367              downstream components.
368
369        Returns:
370            A subgraph of the schema graph induced on nodes that are descendants from the source
371              component and are related to the source through a specific component relationship.
372        """
373
374        # get a list of required component nodes
375        req_components = self.get_component_requirements(source_component)
376
377        # get the subgraph induced on required component nodes
378        req_components_graph = self.get_subgraph_by_edge_type(
379            self.dmr.get_relationship_value("requiresComponent", "edge_key"),
380        ).subgraph(req_components)
381
382        return req_components_graph
383
384    def get_descendants_by_edge_type(
385        self,
386        source_node: str,
387        relationship: str,
388        connected: bool = True,
389        ordered: bool = False,
390    ) -> list[str]:
391        """
392        Get all nodes that are descendants of a given source node, based on a specific
393          type of edge / relationship type.
394
395        Args:
396            source_node: The node whose descendants need to be retrieved.
397            relationship: Edge / link relationship type with possible values same as in above docs.
398            connected:
399              If True, we need to ensure that all descendant nodes are reachable from the source
400                node, i.e., they are part of the same connected component.
401              If False, the descendants could be in multiple connected components.
402              Default value is True.
403            ordered:
404              If True, the list of descendants will be topologically ordered.
405              If False, the list has no particular order (depends on the order in which the
406                descendants were traversed in the subgraph).
407
408        Returns:
409            List of nodes that are descendants from a particular node (sorted / unsorted)
410        """
411
412        root_descendants = nx.descendants(self.graph, source_node)
413
414        subgraph_nodes = list(root_descendants)
415        subgraph_nodes.append(source_node)
416        descendants_subgraph = self.graph.subgraph(subgraph_nodes)
417
418        # prune the descendants subgraph so as to include only those edges that match
419        # the relationship type
420        rel_edges = []
421        for node_1, node_2, key, _ in descendants_subgraph.edges(data=True, keys=True):
422            if key == relationship:
423                rel_edges.append((node_1, node_2))
424
425        relationship_subgraph: nx.DiGraph = nx.DiGraph()
426        relationship_subgraph.add_edges_from(rel_edges)
427
428        descendants = relationship_subgraph.nodes()
429
430        if not descendants:
431            # return empty list if there are no nodes that are reachable from the
432            # source node based on this relationship type
433            return []
434
435        if connected and ordered:
436            # get the set of reachable nodes from the source node
437            descendants = nx.descendants(relationship_subgraph, source_node)
438            descendants.add(source_node)
439
440            # normally, the descendants from a node are unordered (peculiarity
441            # of nx descendants call)
442            # form the subgraph on descendants and order it topologically
443            # this assumes an acyclic subgraph
444            descendants = nx.topological_sort(
445                relationship_subgraph.subgraph(descendants)
446            )
447        elif connected:
448            # get the nodes that are reachable from a given source node
449            # after the pruning process above some nodes in the
450            # root_descendants subgraph might have become disconnected and
451            # will be omitted
452            descendants = nx.descendants(relationship_subgraph, source_node)
453            descendants.add(source_node)
454        elif ordered:
455            # sort the nodes topologically
456            # this requires the graph to be an acyclic graph
457            descendants = nx.topological_sort(relationship_subgraph)
458
459        return list(descendants)
460
461    def get_digraph_by_edge_type(self, edge_type: str) -> nx.DiGraph:
462        """Get a networkx digraph of the nodes connected via a given edge_type.
463        Args:
464            edge_type:
465                Edge type to search for, possible types are defined by 'edge_key'
466                  in relationship class
467        Returns:
468        """
469
470        digraph: nx.DiGraph = nx.DiGraph()
471        for node_1, node_2, key, _ in self.graph.edges(data=True, keys=True):
472            if key == edge_type:
473                digraph.add_edge(node_1, node_2)
474        return digraph
475
476    def get_edges_by_relationship(
477        self,
478        node: str,
479        relationship: str,
480    ) -> list[tuple[str, str]]:
481        """Get a list of out-edges of a node where the edges match a specific type of relationship.
482
483        i.e., the edges connecting a node to its neighbors are of relationship type -- "parentOf"
484          (set of edges to children / sub-class nodes).
485
486        Args:
487            node: the node whose edges we need to look at.
488            relationship: the type of link(s) that the above node and its immediate neighbors share.
489
490        Returns:
491            List of edges that are connected to the node.
492        """
493        edges: list[tuple[str, str]] = []
494
495        for node_1, node_2, key, _ in self.graph.out_edges(node, data=True, keys=True):
496            if key == relationship:
497                edges.append((node_1, node_2))
498
499        return edges
500
501    def get_ordered_entry(self, key: str, source_node_label: str) -> list[str]:
502        """
503        Order the values associated with a particular node and edge_key to
504          match original ordering in schema.
505
506        Args:
507            key (str): a key representing and edge relationship in
508              DataModelRelationships.relationships_dictionary
509            source_node_label (str): node to look for edges of and order
510
511        Raises:
512            KeyError: cannot find source node in graph
513
514        Returns:
515            list[str]:
516              list of sorted nodes, that share the specified relationship with the source node
517              For the example data model, for key='rangeIncludes', source_node_label='CancerType'
518                the return would be ['Breast, 'Colorectal', 'Lung', 'Prostate', 'Skin'] in that
519                exact order.
520        """
521        # Check if node is in the graph, if not throw an error.
522        if not self.is_class_in_schema(node_label=source_node_label):
523            raise KeyError(
524                f"Cannot find node: {source_node_label} in the graph, please check entry."
525            )
526
527        edge_key = self.dmr.get_relationship_value(key, "edge_key")
528
529        # Handle out edges
530        if self.dmr.get_relationship_value(key, "jsonld_direction") == "out":
531            # use out edges
532
533            original_edge_weights_dict = {
534                attached_node: self.graph[source_node][attached_node][edge_key][
535                    "weight"
536                ]
537                for source_node, attached_node in self.graph.out_edges(
538                    source_node_label
539                )
540                if edge_key in self.graph[source_node][attached_node]
541            }
542        # Handle in edges
543        else:
544            # use in edges
545            original_edge_weights_dict = {
546                attached_node: self.graph[attached_node][source_node][edge_key][
547                    "weight"
548                ]
549                for attached_node, source_node in self.graph.in_edges(source_node_label)
550                if edge_key in self.graph[attached_node][source_node]
551            }
552
553        sorted_nodes = list(
554            dict(
555                sorted(original_edge_weights_dict.items(), key=lambda item: item[1])
556            ).keys()
557        )
558
559        return sorted_nodes
560
561    # Get values associated with a node
562    def get_nodes_ancestors(self, subgraph: nx.DiGraph, node_label: str) -> list[str]:
563        """Get a list of nodes reachable from source component in graph
564
565        Args:
566            subgraph (nx.DiGraph): networkx graph object
567            node_label (str): label of node to find ancestors for
568
569        Returns:
570            list[str]: nodes reachable from source in graph
571        """
572        all_ancestors = list(nx.ancestors(subgraph, node_label))
573
574        return all_ancestors
575
576    def get_node_comment(
577        self, node_display_name: Optional[str] = None, node_label: Optional[str] = None
578    ) -> str:
579        """Get the node definition, i.e., the "comment" associated with a given node display name.
580
581        Args:
582            node_display_name, str: Display name of the node which you want to get the comment for.
583            node_label, str: Label of the node you would want to get the comment for.
584        Returns:
585            Comment associated with node, as a string.
586        """
587        node_label = self._get_node_label(node_label, node_display_name)
588
589        if not node_label:
590            return ""
591
592        node_definition = self.graph.nodes[node_label][
593            self.dmr.get_relationship_value("comment", "node_label")
594        ]
595        return node_definition
596
597    def get_node_dependencies(
598        self,
599        source_node: str,
600        display_names: bool = True,
601        schema_ordered: bool = True,
602    ) -> list[str]:
603        """Get the immediate dependencies that are related to a given source node.
604
605        Args:
606            source_node: The node whose dependencies we need to compute.
607            display_names: if True, return list of display names of each of the dependencies.
608                           if False, return list of node labels of each of the dependencies.
609            schema_ordered:
610              if True, return the dependencies of the node following the order of the schema
611                (slower).
612              if False, return dependencies from graph without guaranteeing schema order (faster)
613
614        Returns:
615            List of nodes that are dependent on the source node.
616        """
617
618        if schema_ordered:
619            # get dependencies in the same order in which they are defined in the schema
620            required_dependencies = self.get_ordered_entry(
621                key=self.dmr.get_relationship_value("requiresDependency", "edge_key"),
622                source_node_label=source_node,
623            )
624        else:
625            required_dependencies = self.get_adjacent_nodes_by_relationship(
626                node_label=source_node,
627                relationship=self.dmr.get_relationship_value(
628                    "requiresDependency", "edge_key"
629                ),
630            )
631
632        if display_names:
633            # get display names of dependencies
634            dependencies_display_names = []
635
636            for req in required_dependencies:
637                dependencies_display_names.append(
638                    self.graph.nodes[req][
639                        self.dmr.get_relationship_value("displayName", "node_label")
640                    ]
641                )
642
643            return dependencies_display_names
644
645        return required_dependencies
646
647    def get_nodes_descendants(self, node_label: str) -> list[str]:
648        """Return a list of nodes reachable from source in graph
649        Args:
650            node_label, str: any given node
651        Return:
652            all_descendants, list: nodes reachable from source in graph
653        """
654        all_descendants = list(nx.descendants(self.graph, node_label))
655
656        return all_descendants
657
658    def get_nodes_display_names(
659        self,
660        node_list: list[str],
661    ) -> list[str]:
662        """Get display names associated with the given list of nodes.
663
664        Args:
665            node_list: List of nodes whose display names we need to retrieve.
666
667        Returns:
668            List of display names.
669        """
670        node_list_display_names = [
671            self.graph.nodes[node][
672                self.dmr.get_relationship_value("displayName", "node_label")
673            ]
674            for node in node_list
675        ]
676
677        return node_list_display_names
678
679    def get_node_label(self, node_display_name: str) -> str:
680        """Get the node label for a given display name.
681
682        Args:
683            node_display_name: Display name of the node which you want to get the label for.
684        Returns:
685            Node label associated with given node.
686            If display name not part of schema, return an empty string.
687        """
688
689        node_class_label = get_class_label_from_display_name(
690            display_name=node_display_name
691        )
692        node_property_label = get_property_label_from_display_name(
693            display_name=node_display_name
694        )
695
696        if node_class_label in self.graph.nodes:
697            node_label = node_class_label
698        elif node_property_label in self.graph.nodes:
699            node_label = node_property_label
700        else:
701            node_label = ""
702
703        return node_label
704
705    def get_node_range(
706        self,
707        node_label: Optional[str] = None,
708        node_display_name: Optional[str] = None,
709        display_names: bool = False,
710    ) -> list[str]:
711        """
712        Get the range, i.e., all the valid values that are associated with a node label.
713
714
715        Args:
716            node_label (Optional[str], optional): Node for which you need to retrieve the range.
717              Defaults to None.
718            node_display_name (Optional[str], optional): _description_. Defaults to None.
719            display_names (bool, optional): _description_. Defaults to False.
720
721        Raises:
722            ValueError: If the node cannot be found in the graph.
723
724        Returns:
725            list[str]:
726              If display_names=False, a list of valid values (labels) associated with a given node.
727              If display_names=True, a list of valid values (display names) associated
728                with a given node
729        """
730        node_label = self._get_node_label(node_label, node_display_name)
731        try:
732            # get node range in the order defined in schema for given node
733            required_range = self.find_node_range(node_label=node_label)
734        except KeyError as exc:
735            raise ValueError(
736                f"The source node {node_label} does not exist in the graph. "
737                "Please use a different node."
738            ) from exc
739
740        if display_names:
741            # get the display name(s) of all dependencies
742            dependencies_display_names = []
743
744            for req in required_range:
745                dependencies_display_names.append(self.graph.nodes[req]["displayName"])
746
747            return dependencies_display_names
748
749        return required_range
750
751    def get_node_required(
752        self, node_label: Optional[str] = None, node_display_name: Optional[str] = None
753    ) -> bool:
754        """Check if a given node is required or not.
755
756        Note: The possible options that a node can be associated with -- "required" / "optional".
757
758        Args:
759            node_label: Label of the node for which you need to look up.
760            node_display_name: Display name of the node for which you want look up.
761        Returns:
762            True: If the given node is a "required" node.
763            False: If the given node is not a "required" (i.e., an "optional") node.
764        """
765        node_label = self._get_node_label(node_label, node_display_name)
766        rel_node_label = self.dmr.get_relationship_value("required", "node_label")
767        node_required = self.graph.nodes[node_label][rel_node_label]
768        return node_required
769
770    def get_node_validation_rules(
771        self, node_label: Optional[str] = None, node_display_name: Optional[str] = None
772    ) -> Union[list, dict[str, str]]:
773        """Get validation rules associated with a node,
774
775        Args:
776            node_label: Label of the node for which you need to look up.
777            node_display_name: Display name of the node which you want to get the label for.
778        Returns:
779            A set of validation rules associated with node, as a list or a dictionary.
780        """
781        node_label = self._get_node_label(node_label, node_display_name)
782
783        if not node_label:
784            return []
785
786        try:
787            node_validation_rules = self.graph.nodes[node_label]["validationRules"]
788        except KeyError as key_error:
789            raise ValueError(
790                f"{node_label} is not in the graph, please provide a proper node label"
791            ) from key_error
792
793        return node_validation_rules
794
795    def get_subgraph_by_edge_type(self, relationship: str) -> nx.DiGraph:
796        """Get a subgraph containing all edges of a given type (aka relationship).
797
798        Args:
799            relationship: edge / link relationship type with possible values same as in above docs.
800
801        Returns:
802            Directed graph on edges of a particular type (aka relationship)
803        """
804
805        # prune the metadata model graph so as to include only those edges that
806        # match the relationship type
807        rel_edges = []
808        for node_1, node_2, key, _ in self.graph.out_edges(data=True, keys=True):
809            if key == relationship:
810                rel_edges.append((node_1, node_2))
811
812        relationship_subgraph: nx.DiGraph = nx.DiGraph()
813        relationship_subgraph.add_edges_from(rel_edges)
814
815        return relationship_subgraph
816
817    def find_adjacent_child_classes(
818        self, node_label: Optional[str] = None, node_display_name: Optional[str] = None
819    ) -> list[str]:
820        """Find child classes of a given node.
821        Args:
822            node_display_name: Display name of the node to look up.
823            node_label: Label of the node to look up.
824        Returns:
825            List of nodes that are adjacent to the given node, by SubclassOf relationship.
826        """
827        node_label = self._get_node_label(node_label, node_display_name)
828        return self.get_adjacent_nodes_by_relationship(
829            node_label=node_label,
830            relationship=self.dmr.get_relationship_value("subClassOf", "edge_key"),
831        )
832
833    def find_child_classes(self, schema_class: str) -> list:
834        """Find schema classes that inherit from the given class
835        Args:
836            schema_class: node label for the class to from which to look for children.
837        Returns:
838            list of children to the schema_class.
839        """
840        child_classes = unlist(list(self.graph.successors(schema_class)))
841        assert isinstance(child_classes, list)
842        return child_classes
843
844    def find_class_specific_properties(self, schema_class: str) -> list[str]:
845        """Find properties specifically associated with a given class
846        Args:
847            schema_class, str: node/class label, to identify properties for.
848        Returns:
849            properties, list: List of properties associate with a given schema class.
850        Raises:
851            KeyError: Key error is raised if the provided schema_class is not in the graph
852        """
853
854        if not self.is_class_in_schema(schema_class):
855            raise KeyError(
856                (
857                    f"Schema_class provided: {schema_class} is not in the data model, please check "
858                    "that you are providing the proper class/node label"
859                )
860            )
861
862        properties = []
863        for node1, node2 in self.graph.edges():
864            if (
865                node2 == schema_class
866                and "domainValue" in self.graph[node1][schema_class]
867            ):
868                properties.append(node1)
869        return properties
870
871    def find_parent_classes(self, node_label: str) -> list[list[str]]:
872        """Find all parents of the provided node
873        Args:
874            node_label: label of the node to find parents of
875        Returns:
876            List of list of Parents to the given node.
877        """
878        # Get digraph of nodes with parents
879        digraph = self.get_digraph_by_edge_type("parentOf")
880
881        # Get root node
882        root_node = list(nx.topological_sort(digraph))[0]
883
884        # Get paths between root_node and the target node.
885        paths = nx.all_simple_paths(self.graph, source=root_node, target=node_label)
886
887        return [_path[:-1] for _path in paths]
888
889    def full_schema_graph(self, size: Optional[int] = None) -> graphviz.Digraph:
890        """Create a graph of the data model.
891        Args:
892            size, float: max height and width of the graph, if one value provided
893               it is used for both.
894        Returns:
895            schema graph viz
896        """
897        edges = self.graph.edges()
898        return visualize(edges, size=size)
899
900    def is_class_in_schema(self, node_label: str) -> bool:
901        """Determine if provided node_label is in the schema graph/data model.
902        Args:
903            node_label: label of node to search for in the
904        Returns:
905            True, if node is in the graph schema
906            False, if node is not in graph schema
907        """
908        return node_label in self.graph.nodes()
909
910    def sub_schema_graph(
911        self, source: str, direction: str, size: Optional[float] = None
912    ) -> Optional[graphviz.Digraph]:
913        """Create a sub-schema graph
914        Args:
915            source, str: source node label to start graph
916            direction, str: direction to create the visualization, choose from "up", "down", "both"
917            size, float: max height and width of the graph, if one value provided it is used for
918              both.
919        Returns:
920            Sub-schema graph viz
921        """
922        if direction == "down":
923            edges = list(nx.edge_bfs(self.graph, [source]))
924            return visualize(edges, size=size)
925        if direction == "up":
926            paths = self.find_parent_classes(source)
927            edges = []
928            for _path in paths:
929                _path.append(source)
930                for i in range(0, len(_path) - 1):
931                    edges.append((_path[i], _path[i + 1]))
932            return visualize(edges, size=size)
933        if direction == "both":
934            paths = self.find_parent_classes(source)
935            edges = list(nx.edge_bfs(self.graph, [source]))
936            for _path in paths:
937                _path.append(source)
938                for i in range(0, len(_path) - 1):
939                    edges.append((_path[i], _path[i + 1]))
940            return visualize(edges, size=size)
941        return None
942
943    def get_node_column_type(
944        self, node_label: Optional[str] = None, node_display_name: Optional[str] = None
945    ) -> Optional[JSONSchemaType]:
946        """Gets the column type of the node
947
948        Args:
949            node_label: The label of the node to get the type from
950            node_display_name: The display name of the node to get the type from
951
952        Returns:
953            The column type of the node if it has one, otherwise None
954        """
955        node_label = self._get_node_label(node_label, node_display_name)
956        rel_node_label = self.dmr.get_relationship_value("columnType", "node_label")
957        type_string = self.graph.nodes[node_label][rel_node_label]
958        if type_string is None:
959            return type_string
960        return JSONSchemaType(type_string)
961
962    def _get_node_label(
963        self, node_label: Optional[str] = None, node_display_name: Optional[str] = None
964    ) -> str:
965        """Returns the node label if given otherwise gets the node label from the display name
966
967        Args:
968            node_label: The label of the node to get the type from
969            node_display_name: The display name of the node to get the type from
970
971        Raises:
972            ValueError: If neither node_label or node_display_name is provided
973
974        Returns:
975            The node label
976        """
977        if node_label is not None:
978            return node_label
979        if node_display_name is not None:
980            return self.get_node_label(node_display_name)
981        raise ValueError("Either 'node_label' or 'node_display_name' must be provided.")
982
983
def create_data_model_graph_explorer(data_model_path: str) -> DataModelGraphExplorer:
    """Build a DataModelGraphExplorer from a data model file.

    The model is parsed, turned into a graph by DataModelGraph, and that graph
    is handed to the explorer.

    Args:
        data_model_path: The path to a data model to create the dmge

    Returns:
        DataModelGraphExplorer: A dmge created using the input data model
    """
    parsed_data_model = DataModelParser(path_to_data_model=data_model_path).parse_model()
    graph_data_model = DataModelGraph(parsed_data_model).graph
    return DataModelGraphExplorer(graph_data_model)
logger = <Logger schematic.schemas.data_model_graph (WARNING)>
tracer = <opentelemetry.sdk.trace.Tracer object>
class DataModelGraphMeta:
class DataModelGraphMeta:  # pylint: disable=too-few-public-methods
    """Instance cache written in the shape of a singleton metaclass.

    NOTE(review): this class does not derive from `type`, and DataModelGraph
    attaches it through a `__metaclass__` class attribute, which Python 3 does
    not treat as a metaclass declaration -- confirm whether the singleton
    behaviour is actually wanted before relying on it.
    """

    # Maps each caller (`cls`) to the single instance created for it.
    _instances: dict = {}

    def __call__(  # pylint: disable=no-self-argument
        cls, *args: Any, **kwargs: Any
    ) -> Any:
        """
        Return the cached instance for `cls`, creating it on first use.

        Possible changes to the value of the `__init__` argument do not affect
        the returned instance.
        """
        registry = cls._instances
        if cls in registry:
            return registry[cls]
        created = super().__call__(*args, **kwargs)  # type: ignore # pylint: disable=no-member
        registry[cls] = created
        return created

DataModelGraphMeta

class DataModelGraph:
 51class DataModelGraph:  # pylint: disable=too-few-public-methods
 52    """
 53    Generate graph network (networkx) from the attributes and relationships returned
 54    from the data model parser.
 55
 56    Create a singleton.
 57    """
 58
 59    __metaclass__ = DataModelGraphMeta
 60
 61    def __init__(
 62        self,
 63        attribute_relationships_dict: dict,
 64        data_model_labels: DisplayLabelType = "class_label",
 65    ) -> None:
 66        """Load parsed data model.
 67        Args:
 68            attributes_relationship_dict, dict: generated in data_model_parser
 69                {Attribute Display Name: {
 70                        Relationships: {
 71                                    CSV Header: Value}}}
 72            data_model_labels: str, display_label or class_label.
 73                display_label, use the display name as a label, if it is valid
 74                (contains no blacklisted characters) otherwise will default to schema_label.
 75                class_label, default, use standard class or property label.
 76        Raises:
 77            ValueError, attribute_relationship_dict not loaded.
 78        """
 79        self.attribute_relationships_dict = attribute_relationships_dict
 80        self.dmn = DataModelNodes(self.attribute_relationships_dict)
 81        self.dme = DataModelEdges()
 82        self.dmr = DataModelRelationships()
 83        self.data_model_labels = data_model_labels
 84
 85        if not self.attribute_relationships_dict:
 86            raise ValueError(
 87                (
 88                    "Something has gone wrong, a data model was not loaded into the DataModelGraph "
 89                    "Class. Please check that your paths are correct"
 90                )
 91            )
 92        self.graph = self.generate_data_model_graph()
 93
 94    @tracer.start_as_current_span("DataModelGraph::generate_data_model_graph")
 95    def generate_data_model_graph(self) -> nx.MultiDiGraph:
 96        """
 97        Generate NetworkX Graph from the Relationships/attributes dictionary, the graph is built
 98          by first adding all nodes to the graph, then connecting nodes by the relationships defined
 99          in the attributes_relationship dictionary.
100        Returns:
101            G: nx.MultiDiGraph, networkx graph representation of the data model
102        """
103        # Get all relationships with edges
104        edge_relationships = self.dmr.retrieve_rel_headers_dict(edge=True)
105
106        # Find all nodes
107        all_nodes = self.dmn.gather_all_nodes_in_model(
108            attr_rel_dict=self.attribute_relationships_dict
109        )
110
111        # Instantiate NetworkX MultiDigraph
112        graph: nx.MultiDiGraph = nx.MultiDiGraph()
113
114        all_node_dict = {}
115
116        ## Fill in MultiDigraph with nodes
117        for node in all_nodes:
118            # Gather information for each node
119            node_dict = self.dmn.generate_node_dict(
120                node_display_name=node,
121                attr_rel_dict=self.attribute_relationships_dict,
122                data_model_labels=self.data_model_labels,
123            )
124
125            # Add each node to the all_node_dict to be used for generating edges
126            all_node_dict[node] = node_dict
127
128            # Generate node and attach information (attributes) to each node
129            graph = self.dmn.generate_node(graph, node_dict)
130
131        edge_list: list[tuple[str, str, dict[str, Union[str, int]]]] = []
132        ## Connect nodes via edges
133        for node in all_nodes:
134            # Generate edges
135            edge_list_2 = self.dme.generate_edge(
136                node,
137                all_node_dict,
138                self.attribute_relationships_dict,
139                edge_relationships,
140                edge_list,
141            )
142            edge_list = edge_list_2.copy()
143
144        # Add edges to the Graph
145        for node_1, node_2, edge_dict in edge_list:
146            graph.add_edge(
147                node_1, node_2, key=edge_dict["key"], weight=edge_dict["weight"]
148            )
149        return graph

Generate graph network (networkx) from the attributes and relationships returned from the data model parser.

Create a singleton.

DataModelGraph( attribute_relationships_dict: dict, data_model_labels: Literal['class_label', 'display_label'] = 'class_label')
61    def __init__(
62        self,
63        attribute_relationships_dict: dict,
64        data_model_labels: DisplayLabelType = "class_label",
65    ) -> None:
66        """Load parsed data model.
67        Args:
68            attributes_relationship_dict, dict: generated in data_model_parser
69                {Attribute Display Name: {
70                        Relationships: {
71                                    CSV Header: Value}}}
72            data_model_labels: str, display_label or class_label.
73                display_label, use the display name as a label, if it is valid
74                (contains no blacklisted characters) otherwise will default to schema_label.
75                class_label, default, use standard class or property label.
76        Raises:
77            ValueError, attribute_relationship_dict not loaded.
78        """
79        self.attribute_relationships_dict = attribute_relationships_dict
80        self.dmn = DataModelNodes(self.attribute_relationships_dict)
81        self.dme = DataModelEdges()
82        self.dmr = DataModelRelationships()
83        self.data_model_labels = data_model_labels
84
85        if not self.attribute_relationships_dict:
86            raise ValueError(
87                (
88                    "Something has gone wrong, a data model was not loaded into the DataModelGraph "
89                    "Class. Please check that your paths are correct"
90                )
91            )
92        self.graph = self.generate_data_model_graph()

Load parsed data model.

Arguments:
  • attribute_relationships_dict, dict: generated in data_model_parser {Attribute Display Name: { Relationships: { CSV Header: Value}}}
  • data_model_labels: str, display_label or class_label. display_label, use the display name as a label, if it is valid (contains no blacklisted characters) otherwise will default to schema_label. class_label, default, use standard class or property label.
Raises:
  • ValueError: if attribute_relationships_dict is empty, i.e. no data model was loaded.
attribute_relationships_dict
dmn
dme
dmr
data_model_labels
graph
@tracer.start_as_current_span('DataModelGraph::generate_data_model_graph')
def generate_data_model_graph(self) -> networkx.classes.multidigraph.MultiDiGraph:
 94    @tracer.start_as_current_span("DataModelGraph::generate_data_model_graph")
 95    def generate_data_model_graph(self) -> nx.MultiDiGraph:
 96        """
 97        Generate NetworkX Graph from the Relationships/attributes dictionary, the graph is built
 98          by first adding all nodes to the graph, then connecting nodes by the relationships defined
 99          in the attributes_relationship dictionary.
100        Returns:
101            G: nx.MultiDiGraph, networkx graph representation of the data model
102        """
103        # Get all relationships with edges
104        edge_relationships = self.dmr.retrieve_rel_headers_dict(edge=True)
105
106        # Find all nodes
107        all_nodes = self.dmn.gather_all_nodes_in_model(
108            attr_rel_dict=self.attribute_relationships_dict
109        )
110
111        # Instantiate NetworkX MultiDigraph
112        graph: nx.MultiDiGraph = nx.MultiDiGraph()
113
114        all_node_dict = {}
115
116        ## Fill in MultiDigraph with nodes
117        for node in all_nodes:
118            # Gather information for each node
119            node_dict = self.dmn.generate_node_dict(
120                node_display_name=node,
121                attr_rel_dict=self.attribute_relationships_dict,
122                data_model_labels=self.data_model_labels,
123            )
124
125            # Add each node to the all_node_dict to be used for generating edges
126            all_node_dict[node] = node_dict
127
128            # Generate node and attach information (attributes) to each node
129            graph = self.dmn.generate_node(graph, node_dict)
130
131        edge_list: list[tuple[str, str, dict[str, Union[str, int]]]] = []
132        ## Connect nodes via edges
133        for node in all_nodes:
134            # Generate edges
135            edge_list_2 = self.dme.generate_edge(
136                node,
137                all_node_dict,
138                self.attribute_relationships_dict,
139                edge_relationships,
140                edge_list,
141            )
142            edge_list = edge_list_2.copy()
143
144        # Add edges to the Graph
145        for node_1, node_2, edge_dict in edge_list:
146            graph.add_edge(
147                node_1, node_2, key=edge_dict["key"], weight=edge_dict["weight"]
148            )
149        return graph

Generate NetworkX Graph from the Relationships/attributes dictionary, the graph is built by first adding all nodes to the graph, then connecting nodes by the relationships defined in the attributes_relationship dictionary.

Returns:

G: nx.MultiDiGraph, networkx graph representation of the data model

class DataModelGraphExplorer:
152class DataModelGraphExplorer:  # pylint: disable=too-many-public-methods
153    """DataModelGraphExplorer"""
154
155    def __init__(
156        self,
157        graph: nx.MultiDiGraph,
158    ):
159        """Load data model graph as a singleton.
160        Args:
161            G: nx.MultiDiGraph, networkx graph representation of the data model
162        """
163        self.graph = graph  # At this point the graph is expected to be fully formed.
164        self.dmr = DataModelRelationships()
165
166    def find_properties(self) -> set[str]:
167        """
168        Identify all properties, as defined by the first node in a pair, connected with
169        'domainIncludes' edge type
170
171        Returns:
172            properties, set: All properties defined in the data model, each property name
173              is defined by its label.
174        """
175        properties_list: list[str] = []
176        for node_1, _, rel in self.graph.edges:
177            if rel == self.dmr.get_relationship_value("domainIncludes", "edge_key"):
178                properties_list.append(node_1)
179        properties_set = set(properties_list)
180        return properties_set
181
182    def find_classes(self) -> AbstractSet[str]:
183        """
184        Identify all classes, as defined but all nodes, minus all properties
185        (which are explicitly defined)
186        Returns:
187            classes, set:  All classes defined in the data model, each class
188              name is defined by its label.
189        """
190        nodes = self.graph.nodes
191        properties = self.find_properties()
192        classes = nodes - properties
193        return classes
194
195    def find_node_range(
196        self, node_label: Optional[str] = None, node_display_name: Optional[str] = None
197    ) -> list:
198        """Get valid values for the given node (attribute)
199        Args:
200            node_label, str, Optional[str]: label of the node for which to retrieve valid values
201            node_display_name, str, Optional[str]: Display Name of the node for which to
202              retrieve valid values
203        Returns:
204            valid_values, list: List of valid values associated with the provided node.
205        """
206        node_label = self._get_node_label(node_label, node_display_name)
207
208        valid_values = []
209        for node_1, node_2, rel in self.graph.edges:
210            if node_1 == node_label and rel == self.dmr.get_relationship_value(
211                "rangeIncludes", "edge_key"
212            ):
213                valid_values.append(node_2)
214        valid_values = list(set(valid_values))
215        return valid_values
216
217    def get_adjacent_nodes_by_relationship(
218        self, node_label: str, relationship: str
219    ) -> list[str]:
220        """Get a list of nodes that is / are adjacent to a given node, based on a relationship type.
221
222        Args:
223            node_label: label of the the node whose edges we need to look at.
224            relationship: the type of link(s) that the above node and its immediate neighbors share.
225
226        Returns:
227            List of nodes that are adjacent to the given node.
228        #checked
229        """
230        nodes = set()
231        for _, node_2, key, _ in self.graph.out_edges(node_label, data=True, keys=True):
232            if key == relationship:
233                nodes.add(node_2)
234
235        return list(nodes)
236
237    def get_component_node_required(
238        self,
239        manifest_component: str,
240        node_validation_rules: Optional[list[str]] = None,
241        node_label: Optional[str] = None,
242        node_display_name: Optional[str] = None,
243    ) -> bool:
244        """Check if a node is required taking into account the manifest component it is defined in
245        (requirements can be set in validation rule as well as required column)
246        Args:
247            manifest_component: str, manifest component display name that the node belongs to.
248            node_validation_rules: list[str], validation rules for a given node and component.
249            node_label: str, Label of the node you would want to get the comment for.
250            node_display_name: str, node display name for the node being queried.
251        Returns:
252            True, if node is required, False if not
253        """
254        node_required = False
255
256        if not node_validation_rules:
257            # Get node validation rules for a given component
258            node_validation_rules = self.get_component_node_validation_rules(
259                manifest_component=manifest_component,
260                node_label=node_label,
261                node_display_name=node_display_name,
262            )
263
264        # Check if the validation rule specifies that the node is required for this particular
265        # component.
266        if rule_in_rule_list("required", node_validation_rules):
267            node_required = True
268            # To prevent any unintended errors, ensure the Required field for this node is False
269            if self.get_node_required(
270                node_label=node_label, node_display_name=node_display_name
271            ):
272                if not node_display_name:
273                    assert node_label is not None
274                    node_display_name = self.graph.nodes[node_label][
275                        self.dmr.get_relationship_value("displayName", "node_label")
276                    ]
277                error_str = " ".join(
278                    [
279                        f"For component: {manifest_component} and attribute: {node_display_name}",
280                        "requirements are being specified in both the Required field and in the",
281                        "Validation Rules. If you desire to use validation rules to set component",
282                        "specific requirements for this attribute",
283                        "then the Required field needs to be set to False, or the validation may",
284                        "not work as intended, for other components where the attribute",
285                        "that should not be required.",
286                    ]
287                )
288
289                logger.error(error_str)
290        else:
291            # If requirements are not being set in the validation rule, then just pull the
292            # standard node requirements from the model
293            node_required = self.get_node_required(
294                node_label=node_label, node_display_name=node_display_name
295            )
296        return node_required
297
298    def get_component_node_validation_rules(
299        self,
300        manifest_component: str,
301        node_label: Optional[str] = None,
302        node_display_name: Optional[str] = None,
303    ) -> list:
304        """Get validation rules for a given node and component.
305        Args:
306            manifest_component: str, manifest component display name that the node belongs to.
307            node_label: str, Label of the node you would want to get the comment for.
308            node_display_name: str, node display name for the node being queried.
309        Returns:
310            validation_rules: list, validation rules list for a given node and component.
311        """
312        # get any additional validation rules associated with this node (e.g. can this node
313        # be mapped to a list of other nodes)
314        node_validation_rules = self.get_node_validation_rules(
315            node_label=node_label, node_display_name=node_display_name
316        )
317
318        # Parse the validation rules per component if applicable
319        if node_validation_rules and isinstance(node_validation_rules, dict):
320            node_validation_rules_list = extract_component_validation_rules(
321                manifest_component=manifest_component,
322                validation_rules_dict=node_validation_rules,  # type: ignore
323            )
324        else:
325            assert isinstance(node_validation_rules, list)
326            node_validation_rules_list = node_validation_rules
327        return node_validation_rules_list
328
329    def get_component_requirements(
330        self,
331        source_component: str,
332    ) -> list[str]:
333        """
334        Get all components that are associated with a given source component and are
335          required by it.
336
337        Args:
338            source_component: source component for which we need to find all required downstream
339              components.
340
341        Returns:
342            List of nodes that are descendants from the source component are are related to the
343              source through a specific component relationship.
344        """
345
346        req_components = list(
347            reversed(
348                self.get_descendants_by_edge_type(
349                    source_component,
350                    self.dmr.get_relationship_value("requiresComponent", "edge_key"),
351                    ordered=True,
352                )
353            )
354        )
355
356        return req_components
357
358    def get_component_requirements_graph(
359        self,
360        source_component: str,
361    ) -> nx.Graph:
362        """
363        Get all components that are associated with a given source component and are required by it;
364          return the components as a dependency graph (i.e. a DAG).
365
366        Args:
367            source_component, str: source component for which we need to find all required
368              downstream components.
369
370        Returns:
371            A subgraph of the schema graph induced on nodes that are descendants from the source
372              component and are related to the source through a specific component relationship.
373        """
374
375        # get a list of required component nodes
376        req_components = self.get_component_requirements(source_component)
377
378        # get the subgraph induced on required component nodes
379        req_components_graph = self.get_subgraph_by_edge_type(
380            self.dmr.get_relationship_value("requiresComponent", "edge_key"),
381        ).subgraph(req_components)
382
383        return req_components_graph
384
385    def get_descendants_by_edge_type(
386        self,
387        source_node: str,
388        relationship: str,
389        connected: bool = True,
390        ordered: bool = False,
391    ) -> list[str]:
392        """
393        Get all nodes that are descendants of a given source node, based on a specific
394          type of edge / relationship type.
395
396        Args:
397            source_node: The node whose descendants need to be retrieved.
398            relationship: Edge / link relationship type with possible values same as in above docs.
399            connected:
400              If True, we need to ensure that all descendant nodes are reachable from the source
401                node, i.e., they are part of the same connected component.
402              If False, the descendants could be in multiple connected components.
403              Default value is True.
404            ordered:
405              If True, the list of descendants will be topologically ordered.
406              If False, the list has no particular order (depends on the order in which the
407                descendants were traversed in the subgraph).
408
409        Returns:
410            List of nodes that are descendants from a particular node (sorted / unsorted)
411        """
412
413        root_descendants = nx.descendants(self.graph, source_node)
414
415        subgraph_nodes = list(root_descendants)
416        subgraph_nodes.append(source_node)
417        descendants_subgraph = self.graph.subgraph(subgraph_nodes)
418
419        # prune the descendants subgraph so as to include only those edges that match
420        # the relationship type
421        rel_edges = []
422        for node_1, node_2, key, _ in descendants_subgraph.edges(data=True, keys=True):
423            if key == relationship:
424                rel_edges.append((node_1, node_2))
425
426        relationship_subgraph: nx.DiGraph = nx.DiGraph()
427        relationship_subgraph.add_edges_from(rel_edges)
428
429        descendants = relationship_subgraph.nodes()
430
431        if not descendants:
432            # return empty list if there are no nodes that are reachable from the
433            # source node based on this relationship type
434            return []
435
436        if connected and ordered:
437            # get the set of reachable nodes from the source node
438            descendants = nx.descendants(relationship_subgraph, source_node)
439            descendants.add(source_node)
440
441            # normally, the descendants from a node are unordered (peculiarity
442            # of nx descendants call)
443            # form the subgraph on descendants and order it topologically
444            # this assumes an acyclic subgraph
445            descendants = nx.topological_sort(
446                relationship_subgraph.subgraph(descendants)
447            )
448        elif connected:
449            # get the nodes that are reachable from a given source node
450            # after the pruning process above some nodes in the
451            # root_descendants subgraph might have become disconnected and
452            # will be omitted
453            descendants = nx.descendants(relationship_subgraph, source_node)
454            descendants.add(source_node)
455        elif ordered:
456            # sort the nodes topologically
457            # this requires the graph to be an acyclic graph
458            descendants = nx.topological_sort(relationship_subgraph)
459
460        return list(descendants)
461
462    def get_digraph_by_edge_type(self, edge_type: str) -> nx.DiGraph:
463        """Get a networkx digraph of the nodes connected via a given edge_type.
464        Args:
465            edge_type:
466                Edge type to search for, possible types are defined by 'edge_key'
467                  in relationship class
468        Returns:
469        """
470
471        digraph: nx.DiGraph = nx.DiGraph()
472        for node_1, node_2, key, _ in self.graph.edges(data=True, keys=True):
473            if key == edge_type:
474                digraph.add_edge(node_1, node_2)
475        return digraph
476
477    def get_edges_by_relationship(
478        self,
479        node: str,
480        relationship: str,
481    ) -> list[tuple[str, str]]:
482        """Get a list of out-edges of a node where the edges match a specific type of relationship.
483
484        i.e., the edges connecting a node to its neighbors are of relationship type -- "parentOf"
485          (set of edges to children / sub-class nodes).
486
487        Args:
488            node: the node whose edges we need to look at.
489            relationship: the type of link(s) that the above node and its immediate neighbors share.
490
491        Returns:
492            List of edges that are connected to the node.
493        """
494        edges: list[tuple[str, str]] = []
495
496        for node_1, node_2, key, _ in self.graph.out_edges(node, data=True, keys=True):
497            if key == relationship:
498                edges.append((node_1, node_2))
499
500        return edges
501
502    def get_ordered_entry(self, key: str, source_node_label: str) -> list[str]:
503        """
504        Order the values associated with a particular node and edge_key to
505          match original ordering in schema.
506
507        Args:
508            key (str): a key representing and edge relationship in
509              DataModelRelationships.relationships_dictionary
510            source_node_label (str): node to look for edges of and order
511
512        Raises:
513            KeyError: cannot find source node in graph
514
515        Returns:
516            list[str]:
517              list of sorted nodes, that share the specified relationship with the source node
518              For the example data model, for key='rangeIncludes', source_node_label='CancerType'
519                the return would be ['Breast, 'Colorectal', 'Lung', 'Prostate', 'Skin'] in that
520                exact order.
521        """
522        # Check if node is in the graph, if not throw an error.
523        if not self.is_class_in_schema(node_label=source_node_label):
524            raise KeyError(
525                f"Cannot find node: {source_node_label} in the graph, please check entry."
526            )
527
528        edge_key = self.dmr.get_relationship_value(key, "edge_key")
529
530        # Handle out edges
531        if self.dmr.get_relationship_value(key, "jsonld_direction") == "out":
532            # use out edges
533
534            original_edge_weights_dict = {
535                attached_node: self.graph[source_node][attached_node][edge_key][
536                    "weight"
537                ]
538                for source_node, attached_node in self.graph.out_edges(
539                    source_node_label
540                )
541                if edge_key in self.graph[source_node][attached_node]
542            }
543        # Handle in edges
544        else:
545            # use in edges
546            original_edge_weights_dict = {
547                attached_node: self.graph[attached_node][source_node][edge_key][
548                    "weight"
549                ]
550                for attached_node, source_node in self.graph.in_edges(source_node_label)
551                if edge_key in self.graph[attached_node][source_node]
552            }
553
554        sorted_nodes = list(
555            dict(
556                sorted(original_edge_weights_dict.items(), key=lambda item: item[1])
557            ).keys()
558        )
559
560        return sorted_nodes
561
562    # Get values associated with a node
563    def get_nodes_ancestors(self, subgraph: nx.DiGraph, node_label: str) -> list[str]:
564        """Get a list of nodes reachable from source component in graph
565
566        Args:
567            subgraph (nx.DiGraph): networkx graph object
568            node_label (str): label of node to find ancestors for
569
570        Returns:
571            list[str]: nodes reachable from source in graph
572        """
573        all_ancestors = list(nx.ancestors(subgraph, node_label))
574
575        return all_ancestors
576
577    def get_node_comment(
578        self, node_display_name: Optional[str] = None, node_label: Optional[str] = None
579    ) -> str:
580        """Get the node definition, i.e., the "comment" associated with a given node display name.
581
582        Args:
583            node_display_name, str: Display name of the node which you want to get the comment for.
584            node_label, str: Label of the node you would want to get the comment for.
585        Returns:
586            Comment associated with node, as a string.
587        """
588        node_label = self._get_node_label(node_label, node_display_name)
589
590        if not node_label:
591            return ""
592
593        node_definition = self.graph.nodes[node_label][
594            self.dmr.get_relationship_value("comment", "node_label")
595        ]
596        return node_definition
597
598    def get_node_dependencies(
599        self,
600        source_node: str,
601        display_names: bool = True,
602        schema_ordered: bool = True,
603    ) -> list[str]:
604        """Get the immediate dependencies that are related to a given source node.
605
606        Args:
607            source_node: The node whose dependencies we need to compute.
608            display_names: if True, return list of display names of each of the dependencies.
609                           if False, return list of node labels of each of the dependencies.
610            schema_ordered:
611              if True, return the dependencies of the node following the order of the schema
612                (slower).
613              if False, return dependencies from graph without guaranteeing schema order (faster)
614
615        Returns:
616            List of nodes that are dependent on the source node.
617        """
618
619        if schema_ordered:
620            # get dependencies in the same order in which they are defined in the schema
621            required_dependencies = self.get_ordered_entry(
622                key=self.dmr.get_relationship_value("requiresDependency", "edge_key"),
623                source_node_label=source_node,
624            )
625        else:
626            required_dependencies = self.get_adjacent_nodes_by_relationship(
627                node_label=source_node,
628                relationship=self.dmr.get_relationship_value(
629                    "requiresDependency", "edge_key"
630                ),
631            )
632
633        if display_names:
634            # get display names of dependencies
635            dependencies_display_names = []
636
637            for req in required_dependencies:
638                dependencies_display_names.append(
639                    self.graph.nodes[req][
640                        self.dmr.get_relationship_value("displayName", "node_label")
641                    ]
642                )
643
644            return dependencies_display_names
645
646        return required_dependencies
647
648    def get_nodes_descendants(self, node_label: str) -> list[str]:
649        """Return a list of nodes reachable from source in graph
650        Args:
651            node_label, str: any given node
652        Return:
653            all_descendants, list: nodes reachable from source in graph
654        """
655        all_descendants = list(nx.descendants(self.graph, node_label))
656
657        return all_descendants
658
659    def get_nodes_display_names(
660        self,
661        node_list: list[str],
662    ) -> list[str]:
663        """Get display names associated with the given list of nodes.
664
665        Args:
666            node_list: List of nodes whose display names we need to retrieve.
667
668        Returns:
669            List of display names.
670        """
671        node_list_display_names = [
672            self.graph.nodes[node][
673                self.dmr.get_relationship_value("displayName", "node_label")
674            ]
675            for node in node_list
676        ]
677
678        return node_list_display_names
679
680    def get_node_label(self, node_display_name: str) -> str:
681        """Get the node label for a given display name.
682
683        Args:
684            node_display_name: Display name of the node which you want to get the label for.
685        Returns:
686            Node label associated with given node.
687            If display name not part of schema, return an empty string.
688        """
689
690        node_class_label = get_class_label_from_display_name(
691            display_name=node_display_name
692        )
693        node_property_label = get_property_label_from_display_name(
694            display_name=node_display_name
695        )
696
697        if node_class_label in self.graph.nodes:
698            node_label = node_class_label
699        elif node_property_label in self.graph.nodes:
700            node_label = node_property_label
701        else:
702            node_label = ""
703
704        return node_label
705
706    def get_node_range(
707        self,
708        node_label: Optional[str] = None,
709        node_display_name: Optional[str] = None,
710        display_names: bool = False,
711    ) -> list[str]:
712        """
713        Get the range, i.e., all the valid values that are associated with a node label.
714
715
716        Args:
717            node_label (Optional[str], optional): Node for which you need to retrieve the range.
718              Defaults to None.
719            node_display_name (Optional[str], optional): _description_. Defaults to None.
720            display_names (bool, optional): _description_. Defaults to False.
721
722        Raises:
723            ValueError: If the node cannot be found in the graph.
724
725        Returns:
726            list[str]:
727              If display_names=False, a list of valid values (labels) associated with a given node.
728              If display_names=True, a list of valid values (display names) associated
729                with a given node
730        """
731        node_label = self._get_node_label(node_label, node_display_name)
732        try:
733            # get node range in the order defined in schema for given node
734            required_range = self.find_node_range(node_label=node_label)
735        except KeyError as exc:
736            raise ValueError(
737                f"The source node {node_label} does not exist in the graph. "
738                "Please use a different node."
739            ) from exc
740
741        if display_names:
742            # get the display name(s) of all dependencies
743            dependencies_display_names = []
744
745            for req in required_range:
746                dependencies_display_names.append(self.graph.nodes[req]["displayName"])
747
748            return dependencies_display_names
749
750        return required_range
751
752    def get_node_required(
753        self, node_label: Optional[str] = None, node_display_name: Optional[str] = None
754    ) -> bool:
755        """Check if a given node is required or not.
756
757        Note: The possible options that a node can be associated with -- "required" / "optional".
758
759        Args:
760            node_label: Label of the node for which you need to look up.
761            node_display_name: Display name of the node for which you want look up.
762        Returns:
763            True: If the given node is a "required" node.
764            False: If the given node is not a "required" (i.e., an "optional") node.
765        """
766        node_label = self._get_node_label(node_label, node_display_name)
767        rel_node_label = self.dmr.get_relationship_value("required", "node_label")
768        node_required = self.graph.nodes[node_label][rel_node_label]
769        return node_required
770
771    def get_node_validation_rules(
772        self, node_label: Optional[str] = None, node_display_name: Optional[str] = None
773    ) -> Union[list, dict[str, str]]:
774        """Get validation rules associated with a node,
775
776        Args:
777            node_label: Label of the node for which you need to look up.
778            node_display_name: Display name of the node which you want to get the label for.
779        Returns:
780            A set of validation rules associated with node, as a list or a dictionary.
781        """
782        node_label = self._get_node_label(node_label, node_display_name)
783
784        if not node_label:
785            return []
786
787        try:
788            node_validation_rules = self.graph.nodes[node_label]["validationRules"]
789        except KeyError as key_error:
790            raise ValueError(
791                f"{node_label} is not in the graph, please provide a proper node label"
792            ) from key_error
793
794        return node_validation_rules
795
796    def get_subgraph_by_edge_type(self, relationship: str) -> nx.DiGraph:
797        """Get a subgraph containing all edges of a given type (aka relationship).
798
799        Args:
800            relationship: edge / link relationship type with possible values same as in above docs.
801
802        Returns:
803            Directed graph on edges of a particular type (aka relationship)
804        """
805
806        # prune the metadata model graph so as to include only those edges that
807        # match the relationship type
808        rel_edges = []
809        for node_1, node_2, key, _ in self.graph.out_edges(data=True, keys=True):
810            if key == relationship:
811                rel_edges.append((node_1, node_2))
812
813        relationship_subgraph: nx.DiGraph = nx.DiGraph()
814        relationship_subgraph.add_edges_from(rel_edges)
815
816        return relationship_subgraph
817
818    def find_adjacent_child_classes(
819        self, node_label: Optional[str] = None, node_display_name: Optional[str] = None
820    ) -> list[str]:
821        """Find child classes of a given node.
822        Args:
823            node_display_name: Display name of the node to look up.
824            node_label: Label of the node to look up.
825        Returns:
826            List of nodes that are adjacent to the given node, by SubclassOf relationship.
827        """
828        node_label = self._get_node_label(node_label, node_display_name)
829        return self.get_adjacent_nodes_by_relationship(
830            node_label=node_label,
831            relationship=self.dmr.get_relationship_value("subClassOf", "edge_key"),
832        )
833
834    def find_child_classes(self, schema_class: str) -> list:
835        """Find schema classes that inherit from the given class
836        Args:
837            schema_class: node label for the class to from which to look for children.
838        Returns:
839            list of children to the schema_class.
840        """
841        child_classes = unlist(list(self.graph.successors(schema_class)))
842        assert isinstance(child_classes, list)
843        return child_classes
844
845    def find_class_specific_properties(self, schema_class: str) -> list[str]:
846        """Find properties specifically associated with a given class
847        Args:
848            schema_class, str: node/class label, to identify properties for.
849        Returns:
850            properties, list: List of properties associate with a given schema class.
851        Raises:
852            KeyError: Key error is raised if the provided schema_class is not in the graph
853        """
854
855        if not self.is_class_in_schema(schema_class):
856            raise KeyError(
857                (
858                    f"Schema_class provided: {schema_class} is not in the data model, please check "
859                    "that you are providing the proper class/node label"
860                )
861            )
862
863        properties = []
864        for node1, node2 in self.graph.edges():
865            if (
866                node2 == schema_class
867                and "domainValue" in self.graph[node1][schema_class]
868            ):
869                properties.append(node1)
870        return properties
871
872    def find_parent_classes(self, node_label: str) -> list[list[str]]:
873        """Find all parents of the provided node
874        Args:
875            node_label: label of the node to find parents of
876        Returns:
877            List of list of Parents to the given node.
878        """
879        # Get digraph of nodes with parents
880        digraph = self.get_digraph_by_edge_type("parentOf")
881
882        # Get root node
883        root_node = list(nx.topological_sort(digraph))[0]
884
885        # Get paths between root_node and the target node.
886        paths = nx.all_simple_paths(self.graph, source=root_node, target=node_label)
887
888        return [_path[:-1] for _path in paths]
889
890    def full_schema_graph(self, size: Optional[int] = None) -> graphviz.Digraph:
891        """Create a graph of the data model.
892        Args:
893            size, float: max height and width of the graph, if one value provided
894               it is used for both.
895        Returns:
896            schema graph viz
897        """
898        edges = self.graph.edges()
899        return visualize(edges, size=size)
900
901    def is_class_in_schema(self, node_label: str) -> bool:
902        """Determine if provided node_label is in the schema graph/data model.
903        Args:
904            node_label: label of node to search for in the
905        Returns:
906            True, if node is in the graph schema
907            False, if node is not in graph schema
908        """
909        return node_label in self.graph.nodes()
910
911    def sub_schema_graph(
912        self, source: str, direction: str, size: Optional[float] = None
913    ) -> Optional[graphviz.Digraph]:
914        """Create a sub-schema graph
915        Args:
916            source, str: source node label to start graph
917            direction, str: direction to create the visualization, choose from "up", "down", "both"
918            size, float: max height and width of the graph, if one value provided it is used for
919              both.
920        Returns:
921            Sub-schema graph viz
922        """
923        if direction == "down":
924            edges = list(nx.edge_bfs(self.graph, [source]))
925            return visualize(edges, size=size)
926        if direction == "up":
927            paths = self.find_parent_classes(source)
928            edges = []
929            for _path in paths:
930                _path.append(source)
931                for i in range(0, len(_path) - 1):
932                    edges.append((_path[i], _path[i + 1]))
933            return visualize(edges, size=size)
934        if direction == "both":
935            paths = self.find_parent_classes(source)
936            edges = list(nx.edge_bfs(self.graph, [source]))
937            for _path in paths:
938                _path.append(source)
939                for i in range(0, len(_path) - 1):
940                    edges.append((_path[i], _path[i + 1]))
941            return visualize(edges, size=size)
942        return None
943
944    def get_node_column_type(
945        self, node_label: Optional[str] = None, node_display_name: Optional[str] = None
946    ) -> Optional[JSONSchemaType]:
947        """Gets the column type of the node
948
949        Args:
950            node_label: The label of the node to get the type from
951            node_display_name: The display name of the node to get the type from
952
953        Returns:
954            The column type of the node if it has one, otherwise None
955        """
956        node_label = self._get_node_label(node_label, node_display_name)
957        rel_node_label = self.dmr.get_relationship_value("columnType", "node_label")
958        type_string = self.graph.nodes[node_label][rel_node_label]
959        if type_string is None:
960            return type_string
961        return JSONSchemaType(type_string)
962
963    def _get_node_label(
964        self, node_label: Optional[str] = None, node_display_name: Optional[str] = None
965    ) -> str:
966        """Returns the node label if given otherwise gets the node label from the display name
967
968        Args:
969            node_label: The label of the node to get the type from
970            node_display_name: The display name of the node to get the type from
971
972        Raises:
973            ValueError: If neither node_label or node_display_name is provided
974
975        Returns:
976            The node label
977        """
978        if node_label is not None:
979            return node_label
980        if node_display_name is not None:
981            return self.get_node_label(node_display_name)
982        raise ValueError("Either 'node_label' or 'node_display_name' must be provided.")

DataModelGraphExplorer

DataModelGraphExplorer(graph: networkx.classes.multidigraph.MultiDiGraph)
155    def __init__(
156        self,
157        graph: nx.MultiDiGraph,
158    ):
159        """Load data model graph as a singleton.
160        Args:
161            G: nx.MultiDiGraph, networkx graph representation of the data model
162        """
163        self.graph = graph  # At this point the graph is expected to be fully formed.
164        self.dmr = DataModelRelationships()

Load data model graph as a singleton.

Arguments:
  • graph: nx.MultiDiGraph, networkx graph representation of the data model
graph
dmr
def find_properties(self) -> set[str]:
166    def find_properties(self) -> set[str]:
167        """
168        Identify all properties, as defined by the first node in a pair, connected with
169        'domainIncludes' edge type
170
171        Returns:
172            properties, set: All properties defined in the data model, each property name
173              is defined by its label.
174        """
175        properties_list: list[str] = []
176        for node_1, _, rel in self.graph.edges:
177            if rel == self.dmr.get_relationship_value("domainIncludes", "edge_key"):
178                properties_list.append(node_1)
179        properties_set = set(properties_list)
180        return properties_set

Identify all properties, as defined by the first node in a pair, connected with 'domainIncludes' edge type

Returns:

properties, set: All properties defined in the data model, each property name is defined by its label.

def find_classes(self) -> AbstractSet[str]:
182    def find_classes(self) -> AbstractSet[str]:
183        """
184        Identify all classes, as defined but all nodes, minus all properties
185        (which are explicitly defined)
186        Returns:
187            classes, set:  All classes defined in the data model, each class
188              name is defined by its label.
189        """
190        nodes = self.graph.nodes
191        properties = self.find_properties()
192        classes = nodes - properties
193        return classes

Identify all classes, as defined by all nodes, minus all properties (which are explicitly defined)

Returns:

classes, set: All classes defined in the data model, each class name is defined by its label.

def find_node_range( self, node_label: Optional[str] = None, node_display_name: Optional[str] = None) -> list:
195    def find_node_range(
196        self, node_label: Optional[str] = None, node_display_name: Optional[str] = None
197    ) -> list:
198        """Get valid values for the given node (attribute)
199        Args:
200            node_label, str, Optional[str]: label of the node for which to retrieve valid values
201            node_display_name, str, Optional[str]: Display Name of the node for which to
202              retrieve valid values
203        Returns:
204            valid_values, list: List of valid values associated with the provided node.
205        """
206        node_label = self._get_node_label(node_label, node_display_name)
207
208        valid_values = []
209        for node_1, node_2, rel in self.graph.edges:
210            if node_1 == node_label and rel == self.dmr.get_relationship_value(
211                "rangeIncludes", "edge_key"
212            ):
213                valid_values.append(node_2)
214        valid_values = list(set(valid_values))
215        return valid_values

Get valid values for the given node (attribute)

Arguments:
  • node_label, str, Optional[str]: label of the node for which to retrieve valid values
  • node_display_name, str, Optional[str]: Display Name of the node for which to retrieve valid values
Returns:

valid_values, list: List of valid values associated with the provided node.

def get_adjacent_nodes_by_relationship(self, node_label: str, relationship: str) -> list[str]:
217    def get_adjacent_nodes_by_relationship(
218        self, node_label: str, relationship: str
219    ) -> list[str]:
220        """Get a list of nodes that is / are adjacent to a given node, based on a relationship type.
221
222        Args:
223            node_label: label of the the node whose edges we need to look at.
224            relationship: the type of link(s) that the above node and its immediate neighbors share.
225
226        Returns:
227            List of nodes that are adjacent to the given node.
228        #checked
229        """
230        nodes = set()
231        for _, node_2, key, _ in self.graph.out_edges(node_label, data=True, keys=True):
232            if key == relationship:
233                nodes.add(node_2)
234
235        return list(nodes)

Get a list of nodes that is / are adjacent to a given node, based on a relationship type.

Arguments:
  • node_label: label of the node whose edges we need to look at.
  • relationship: the type of link(s) that the above node and its immediate neighbors share.
Returns:

List of nodes that are adjacent to the given node.

checked

def get_component_node_required( self, manifest_component: str, node_validation_rules: Optional[list[str]] = None, node_label: Optional[str] = None, node_display_name: Optional[str] = None) -> bool:
237    def get_component_node_required(
238        self,
239        manifest_component: str,
240        node_validation_rules: Optional[list[str]] = None,
241        node_label: Optional[str] = None,
242        node_display_name: Optional[str] = None,
243    ) -> bool:
244        """Check if a node is required taking into account the manifest component it is defined in
245        (requirements can be set in validation rule as well as required column)
246        Args:
247            manifest_component: str, manifest component display name that the node belongs to.
248            node_validation_rules: list[str], validation rules for a given node and component.
249            node_label: str, Label of the node you would want to get the comment for.
250            node_display_name: str, node display name for the node being queried.
251        Returns:
252            True, if node is required, False if not
253        """
254        node_required = False
255
256        if not node_validation_rules:
257            # Get node validation rules for a given component
258            node_validation_rules = self.get_component_node_validation_rules(
259                manifest_component=manifest_component,
260                node_label=node_label,
261                node_display_name=node_display_name,
262            )
263
264        # Check if the validation rule specifies that the node is required for this particular
265        # component.
266        if rule_in_rule_list("required", node_validation_rules):
267            node_required = True
268            # To prevent any unintended errors, ensure the Required field for this node is False
269            if self.get_node_required(
270                node_label=node_label, node_display_name=node_display_name
271            ):
272                if not node_display_name:
273                    assert node_label is not None
274                    node_display_name = self.graph.nodes[node_label][
275                        self.dmr.get_relationship_value("displayName", "node_label")
276                    ]
277                error_str = " ".join(
278                    [
279                        f"For component: {manifest_component} and attribute: {node_display_name}",
280                        "requirements are being specified in both the Required field and in the",
281                        "Validation Rules. If you desire to use validation rules to set component",
282                        "specific requirements for this attribute",
283                        "then the Required field needs to be set to False, or the validation may",
284                        "not work as intended, for other components where the attribute",
285                        "that should not be required.",
286                    ]
287                )
288
289                logger.error(error_str)
290        else:
291            # If requirements are not being set in the validation rule, then just pull the
292            # standard node requirements from the model
293            node_required = self.get_node_required(
294                node_label=node_label, node_display_name=node_display_name
295            )
296        return node_required

Check if a node is required taking into account the manifest component it is defined in (requirements can be set in validation rule as well as required column)

Arguments:
  • manifest_component: str, manifest component display name that the node belongs to.
  • node_validation_rules: list[str], validation rules for a given node and component.
  • node_label: str, label of the node being queried.
  • node_display_name: str, node display name for the node being queried.
Returns:

True, if node is required, False if not

def get_component_node_validation_rules( self, manifest_component: str, node_label: Optional[str] = None, node_display_name: Optional[str] = None) -> list:
298    def get_component_node_validation_rules(
299        self,
300        manifest_component: str,
301        node_label: Optional[str] = None,
302        node_display_name: Optional[str] = None,
303    ) -> list:
304        """Get validation rules for a given node and component.
305        Args:
306            manifest_component: str, manifest component display name that the node belongs to.
307            node_label: str, Label of the node you would want to get the comment for.
308            node_display_name: str, node display name for the node being queried.
309        Returns:
310            validation_rules: list, validation rules list for a given node and component.
311        """
312        # get any additional validation rules associated with this node (e.g. can this node
313        # be mapped to a list of other nodes)
314        node_validation_rules = self.get_node_validation_rules(
315            node_label=node_label, node_display_name=node_display_name
316        )
317
318        # Parse the validation rules per component if applicable
319        if node_validation_rules and isinstance(node_validation_rules, dict):
320            node_validation_rules_list = extract_component_validation_rules(
321                manifest_component=manifest_component,
322                validation_rules_dict=node_validation_rules,  # type: ignore
323            )
324        else:
325            assert isinstance(node_validation_rules, list)
326            node_validation_rules_list = node_validation_rules
327        return node_validation_rules_list

Get validation rules for a given node and component.

Arguments:
  • manifest_component: str, manifest component display name that the node belongs to.
  • node_label: str, Label of the node whose validation rules you want to get.
  • node_display_name: str, node display name for the node being queried.
Returns:

validation_rules: list, validation rules list for a given node and component.

def get_component_requirements(self, source_component: str) -> list[str]:
329    def get_component_requirements(
330        self,
331        source_component: str,
332    ) -> list[str]:
333        """
334        Get all components that are associated with a given source component and are
335          required by it.
336
337        Args:
338            source_component: source component for which we need to find all required downstream
339              components.
340
341        Returns:
342            List of nodes that are descendants from the source component are are related to the
343              source through a specific component relationship.
344        """
345
346        req_components = list(
347            reversed(
348                self.get_descendants_by_edge_type(
349                    source_component,
350                    self.dmr.get_relationship_value("requiresComponent", "edge_key"),
351                    ordered=True,
352                )
353            )
354        )
355
356        return req_components

Get all components that are associated with a given source component and are required by it.

Arguments:
  • source_component: source component for which we need to find all required downstream components.
Returns:

List of nodes that are descendants of the source component and are related to the source through a specific component relationship.

def get_component_requirements_graph(self, source_component: str) -> networkx.classes.graph.Graph:
358    def get_component_requirements_graph(
359        self,
360        source_component: str,
361    ) -> nx.Graph:
362        """
363        Get all components that are associated with a given source component and are required by it;
364          return the components as a dependency graph (i.e. a DAG).
365
366        Args:
367            source_component, str: source component for which we need to find all required
368              downstream components.
369
370        Returns:
371            A subgraph of the schema graph induced on nodes that are descendants from the source
372              component and are related to the source through a specific component relationship.
373        """
374
375        # get a list of required component nodes
376        req_components = self.get_component_requirements(source_component)
377
378        # get the subgraph induced on required component nodes
379        req_components_graph = self.get_subgraph_by_edge_type(
380            self.dmr.get_relationship_value("requiresComponent", "edge_key"),
381        ).subgraph(req_components)
382
383        return req_components_graph

Get all components that are associated with a given source component and are required by it; return the components as a dependency graph (i.e. a DAG).

Arguments:
  • source_component, str: source component for which we need to find all required downstream components.
Returns:

A subgraph of the schema graph induced on nodes that are descendants from the source component and are related to the source through a specific component relationship.

def get_descendants_by_edge_type( self, source_node: str, relationship: str, connected: bool = True, ordered: bool = False) -> list[str]:
385    def get_descendants_by_edge_type(
386        self,
387        source_node: str,
388        relationship: str,
389        connected: bool = True,
390        ordered: bool = False,
391    ) -> list[str]:
392        """
393        Get all nodes that are descendants of a given source node, based on a specific
394          type of edge / relationship type.
395
396        Args:
397            source_node: The node whose descendants need to be retrieved.
398            relationship: Edge / link relationship type with possible values same as in above docs.
399            connected:
400              If True, we need to ensure that all descendant nodes are reachable from the source
401                node, i.e., they are part of the same connected component.
402              If False, the descendants could be in multiple connected components.
403              Default value is True.
404            ordered:
405              If True, the list of descendants will be topologically ordered.
406              If False, the list has no particular order (depends on the order in which the
407                descendants were traversed in the subgraph).
408
409        Returns:
410            List of nodes that are descendants from a particular node (sorted / unsorted)
411        """
412
413        root_descendants = nx.descendants(self.graph, source_node)
414
415        subgraph_nodes = list(root_descendants)
416        subgraph_nodes.append(source_node)
417        descendants_subgraph = self.graph.subgraph(subgraph_nodes)
418
419        # prune the descendants subgraph so as to include only those edges that match
420        # the relationship type
421        rel_edges = []
422        for node_1, node_2, key, _ in descendants_subgraph.edges(data=True, keys=True):
423            if key == relationship:
424                rel_edges.append((node_1, node_2))
425
426        relationship_subgraph: nx.DiGraph = nx.DiGraph()
427        relationship_subgraph.add_edges_from(rel_edges)
428
429        descendants = relationship_subgraph.nodes()
430
431        if not descendants:
432            # return empty list if there are no nodes that are reachable from the
433            # source node based on this relationship type
434            return []
435
436        if connected and ordered:
437            # get the set of reachable nodes from the source node
438            descendants = nx.descendants(relationship_subgraph, source_node)
439            descendants.add(source_node)
440
441            # normally, the descendants from a node are unordered (peculiarity
442            # of nx descendants call)
443            # form the subgraph on descendants and order it topologically
444            # this assumes an acyclic subgraph
445            descendants = nx.topological_sort(
446                relationship_subgraph.subgraph(descendants)
447            )
448        elif connected:
449            # get the nodes that are reachable from a given source node
450            # after the pruning process above some nodes in the
451            # root_descendants subgraph might have become disconnected and
452            # will be omitted
453            descendants = nx.descendants(relationship_subgraph, source_node)
454            descendants.add(source_node)
455        elif ordered:
456            # sort the nodes topologically
457            # this requires the graph to be an acyclic graph
458            descendants = nx.topological_sort(relationship_subgraph)
459
460        return list(descendants)

Get all nodes that are descendants of a given source node, based on a specific type of edge / relationship type.

Arguments:
  • source_node: The node whose descendants need to be retrieved.
  • relationship: Edge / link relationship type with possible values same as in above docs.
  • connected: If True, we need to ensure that all descendant nodes are reachable from the source node, i.e., they are part of the same connected component. If False, the descendants could be in multiple connected components. Default value is True.
  • ordered: If True, the list of descendants will be topologically ordered. If False, the list has no particular order (depends on the order in which the descendants were traversed in the subgraph).
Returns:

List of nodes that are descendants from a particular node (sorted / unsorted)

def get_digraph_by_edge_type(self, edge_type: str) -> networkx.classes.digraph.DiGraph:
462    def get_digraph_by_edge_type(self, edge_type: str) -> nx.DiGraph:
463        """Get a networkx digraph of the nodes connected via a given edge_type.
464        Args:
465            edge_type:
466                Edge type to search for, possible types are defined by 'edge_key'
467                  in relationship class
468        Returns:
469        """
470
471        digraph: nx.DiGraph = nx.DiGraph()
472        for node_1, node_2, key, _ in self.graph.edges(data=True, keys=True):
473            if key == edge_type:
474                digraph.add_edge(node_1, node_2)
475        return digraph

Get a networkx digraph of the nodes connected via a given edge_type.

Arguments:
  • edge_type: Edge type to search for, possible types are defined by 'edge_key' in relationship class

Returns:

def get_edges_by_relationship(self, node: str, relationship: str) -> list[tuple[str, str]]:
477    def get_edges_by_relationship(
478        self,
479        node: str,
480        relationship: str,
481    ) -> list[tuple[str, str]]:
482        """Get a list of out-edges of a node where the edges match a specific type of relationship.
483
484        i.e., the edges connecting a node to its neighbors are of relationship type -- "parentOf"
485          (set of edges to children / sub-class nodes).
486
487        Args:
488            node: the node whose edges we need to look at.
489            relationship: the type of link(s) that the above node and its immediate neighbors share.
490
491        Returns:
492            List of edges that are connected to the node.
493        """
494        edges: list[tuple[str, str]] = []
495
496        for node_1, node_2, key, _ in self.graph.out_edges(node, data=True, keys=True):
497            if key == relationship:
498                edges.append((node_1, node_2))
499
500        return edges

Get a list of out-edges of a node where the edges match a specific type of relationship.

i.e., the edges connecting a node to its neighbors are of relationship type -- "parentOf" (set of edges to children / sub-class nodes).

Arguments:
  • node: the node whose edges we need to look at.
  • relationship: the type of link(s) that the above node and its immediate neighbors share.
Returns:

List of edges that are connected to the node.

def get_ordered_entry(self, key: str, source_node_label: str) -> list[str]:
502    def get_ordered_entry(self, key: str, source_node_label: str) -> list[str]:
503        """
504        Order the values associated with a particular node and edge_key to
505          match original ordering in schema.
506
507        Args:
508            key (str): a key representing and edge relationship in
509              DataModelRelationships.relationships_dictionary
510            source_node_label (str): node to look for edges of and order
511
512        Raises:
513            KeyError: cannot find source node in graph
514
515        Returns:
516            list[str]:
517              list of sorted nodes, that share the specified relationship with the source node
518              For the example data model, for key='rangeIncludes', source_node_label='CancerType'
519                the return would be ['Breast, 'Colorectal', 'Lung', 'Prostate', 'Skin'] in that
520                exact order.
521        """
522        # Check if node is in the graph, if not throw an error.
523        if not self.is_class_in_schema(node_label=source_node_label):
524            raise KeyError(
525                f"Cannot find node: {source_node_label} in the graph, please check entry."
526            )
527
528        edge_key = self.dmr.get_relationship_value(key, "edge_key")
529
530        # Handle out edges
531        if self.dmr.get_relationship_value(key, "jsonld_direction") == "out":
532            # use out edges
533
534            original_edge_weights_dict = {
535                attached_node: self.graph[source_node][attached_node][edge_key][
536                    "weight"
537                ]
538                for source_node, attached_node in self.graph.out_edges(
539                    source_node_label
540                )
541                if edge_key in self.graph[source_node][attached_node]
542            }
543        # Handle in edges
544        else:
545            # use in edges
546            original_edge_weights_dict = {
547                attached_node: self.graph[attached_node][source_node][edge_key][
548                    "weight"
549                ]
550                for attached_node, source_node in self.graph.in_edges(source_node_label)
551                if edge_key in self.graph[attached_node][source_node]
552            }
553
554        sorted_nodes = list(
555            dict(
556                sorted(original_edge_weights_dict.items(), key=lambda item: item[1])
557            ).keys()
558        )
559
560        return sorted_nodes

Order the values associated with a particular node and edge_key to match original ordering in schema.

Arguments:
  • key (str): a key representing an edge relationship in DataModelRelationships.relationships_dictionary
  • source_node_label (str): node to look for edges of and order
Raises:
  • KeyError: cannot find source node in graph
Returns:

list[str]: list of sorted nodes that share the specified relationship with the source node. For the example data model, for key='rangeIncludes', source_node_label='CancerType' the return would be ['Breast', 'Colorectal', 'Lung', 'Prostate', 'Skin'] in that exact order.

def get_nodes_ancestors( self, subgraph: networkx.classes.digraph.DiGraph, node_label: str) -> list[str]:
563    def get_nodes_ancestors(self, subgraph: nx.DiGraph, node_label: str) -> list[str]:
564        """Get a list of nodes reachable from source component in graph
565
566        Args:
567            subgraph (nx.DiGraph): networkx graph object
568            node_label (str): label of node to find ancestors for
569
570        Returns:
571            list[str]: nodes reachable from source in graph
572        """
573        all_ancestors = list(nx.ancestors(subgraph, node_label))
574
575        return all_ancestors

Get a list of the ancestors of a node, i.e. the nodes in the graph from which the given node is reachable

Arguments:
  • subgraph (nx.DiGraph): networkx graph object
  • node_label (str): label of node to find ancestors for
Returns:

list[str]: ancestors of node_label in the graph

def get_node_comment( self, node_display_name: Optional[str] = None, node_label: Optional[str] = None) -> str:
577    def get_node_comment(
578        self, node_display_name: Optional[str] = None, node_label: Optional[str] = None
579    ) -> str:
580        """Get the node definition, i.e., the "comment" associated with a given node display name.
581
582        Args:
583            node_display_name, str: Display name of the node which you want to get the comment for.
584            node_label, str: Label of the node you would want to get the comment for.
585        Returns:
586            Comment associated with node, as a string.
587        """
588        node_label = self._get_node_label(node_label, node_display_name)
589
590        if not node_label:
591            return ""
592
593        node_definition = self.graph.nodes[node_label][
594            self.dmr.get_relationship_value("comment", "node_label")
595        ]
596        return node_definition

Get the node definition, i.e., the "comment" associated with a given node display name.

Arguments:
  • node_display_name, str: Display name of the node which you want to get the comment for.
  • node_label, str: Label of the node you would want to get the comment for.
Returns:

Comment associated with node, as a string.

def get_node_dependencies( self, source_node: str, display_names: bool = True, schema_ordered: bool = True) -> list[str]:
598    def get_node_dependencies(
599        self,
600        source_node: str,
601        display_names: bool = True,
602        schema_ordered: bool = True,
603    ) -> list[str]:
604        """Get the immediate dependencies that are related to a given source node.
605
606        Args:
607            source_node: The node whose dependencies we need to compute.
608            display_names: if True, return list of display names of each of the dependencies.
609                           if False, return list of node labels of each of the dependencies.
610            schema_ordered:
611              if True, return the dependencies of the node following the order of the schema
612                (slower).
613              if False, return dependencies from graph without guaranteeing schema order (faster)
614
615        Returns:
616            List of nodes that are dependent on the source node.
617        """
618
619        if schema_ordered:
620            # get dependencies in the same order in which they are defined in the schema
621            required_dependencies = self.get_ordered_entry(
622                key=self.dmr.get_relationship_value("requiresDependency", "edge_key"),
623                source_node_label=source_node,
624            )
625        else:
626            required_dependencies = self.get_adjacent_nodes_by_relationship(
627                node_label=source_node,
628                relationship=self.dmr.get_relationship_value(
629                    "requiresDependency", "edge_key"
630                ),
631            )
632
633        if display_names:
634            # get display names of dependencies
635            dependencies_display_names = []
636
637            for req in required_dependencies:
638                dependencies_display_names.append(
639                    self.graph.nodes[req][
640                        self.dmr.get_relationship_value("displayName", "node_label")
641                    ]
642                )
643
644            return dependencies_display_names
645
646        return required_dependencies

Get the immediate dependencies that are related to a given source node.

Arguments:
  • source_node: The node whose dependencies we need to compute.
  • display_names: if True, return list of display names of each of the dependencies. if False, return list of node labels of each of the dependencies.
  • schema_ordered: if True, return the dependencies of the node following the order of the schema (slower). if False, return dependencies from graph without guaranteeing schema order (faster)
Returns:

List of nodes that are dependent on the source node.

def get_nodes_descendants(self, node_label: str) -> list[str]:
648    def get_nodes_descendants(self, node_label: str) -> list[str]:
649        """Return a list of nodes reachable from source in graph
650        Args:
651            node_label, str: any given node
652        Return:
653            all_descendants, list: nodes reachable from source in graph
654        """
655        all_descendants = list(nx.descendants(self.graph, node_label))
656
657        return all_descendants

Return a list of nodes reachable from source in graph

Arguments:
  • node_label, str: any given node
Return:

all_descendants, list: nodes reachable from source in graph

def get_nodes_display_names(self, node_list: list[str]) -> list[str]:
659    def get_nodes_display_names(
660        self,
661        node_list: list[str],
662    ) -> list[str]:
663        """Get display names associated with the given list of nodes.
664
665        Args:
666            node_list: List of nodes whose display names we need to retrieve.
667
668        Returns:
669            List of display names.
670        """
671        node_list_display_names = [
672            self.graph.nodes[node][
673                self.dmr.get_relationship_value("displayName", "node_label")
674            ]
675            for node in node_list
676        ]
677
678        return node_list_display_names

Get display names associated with the given list of nodes.

Arguments:
  • node_list: List of nodes whose display names we need to retrieve.
Returns:

List of display names.

def get_node_label(self, node_display_name: str) -> str:
680    def get_node_label(self, node_display_name: str) -> str:
681        """Get the node label for a given display name.
682
683        Args:
684            node_display_name: Display name of the node which you want to get the label for.
685        Returns:
686            Node label associated with given node.
687            If display name not part of schema, return an empty string.
688        """
689
690        node_class_label = get_class_label_from_display_name(
691            display_name=node_display_name
692        )
693        node_property_label = get_property_label_from_display_name(
694            display_name=node_display_name
695        )
696
697        if node_class_label in self.graph.nodes:
698            node_label = node_class_label
699        elif node_property_label in self.graph.nodes:
700            node_label = node_property_label
701        else:
702            node_label = ""
703
704        return node_label

Get the node label for a given display name.

Arguments:
  • node_display_name: Display name of the node which you want to get the label for.
Returns:

Node label associated with given node. If display name not part of schema, return an empty string.

def get_node_range( self, node_label: Optional[str] = None, node_display_name: Optional[str] = None, display_names: bool = False) -> list[str]:
706    def get_node_range(
707        self,
708        node_label: Optional[str] = None,
709        node_display_name: Optional[str] = None,
710        display_names: bool = False,
711    ) -> list[str]:
712        """
713        Get the range, i.e., all the valid values that are associated with a node label.
714
715
716        Args:
717            node_label (Optional[str], optional): Node for which you need to retrieve the range.
718              Defaults to None.
719            node_display_name (Optional[str], optional): _description_. Defaults to None.
720            display_names (bool, optional): _description_. Defaults to False.
721
722        Raises:
723            ValueError: If the node cannot be found in the graph.
724
725        Returns:
726            list[str]:
727              If display_names=False, a list of valid values (labels) associated with a given node.
728              If display_names=True, a list of valid values (display names) associated
729                with a given node
730        """
731        node_label = self._get_node_label(node_label, node_display_name)
732        try:
733            # get node range in the order defined in schema for given node
734            required_range = self.find_node_range(node_label=node_label)
735        except KeyError as exc:
736            raise ValueError(
737                f"The source node {node_label} does not exist in the graph. "
738                "Please use a different node."
739            ) from exc
740
741        if display_names:
742            # get the display name(s) of all dependencies
743            dependencies_display_names = []
744
745            for req in required_range:
746                dependencies_display_names.append(self.graph.nodes[req]["displayName"])
747
748            return dependencies_display_names
749
750        return required_range

Get the range, i.e., all the valid values that are associated with a node label.

Arguments:
  • node_label (Optional[str], optional): Node for which you need to retrieve the range. Defaults to None.
  • node_display_name (Optional[str], optional): _description_. Defaults to None.
  • display_names (bool, optional): _description_. Defaults to False.
Raises:
  • ValueError: If the node cannot be found in the graph.
Returns:

list[str]: If display_names=False, a list of valid values (labels) associated with a given node. If display_names=True, a list of valid values (display names) associated with a given node

def get_node_required( self, node_label: Optional[str] = None, node_display_name: Optional[str] = None) -> bool:
752    def get_node_required(
753        self, node_label: Optional[str] = None, node_display_name: Optional[str] = None
754    ) -> bool:
755        """Check if a given node is required or not.
756
757        Note: The possible options that a node can be associated with -- "required" / "optional".
758
759        Args:
760            node_label: Label of the node for which you need to look up.
761            node_display_name: Display name of the node for which you want look up.
762        Returns:
763            True: If the given node is a "required" node.
764            False: If the given node is not a "required" (i.e., an "optional") node.
765        """
766        node_label = self._get_node_label(node_label, node_display_name)
767        rel_node_label = self.dmr.get_relationship_value("required", "node_label")
768        node_required = self.graph.nodes[node_label][rel_node_label]
769        return node_required

Check if a given node is required or not.

Note: The possible options that a node can be associated with -- "required" / "optional".

Arguments:
  • node_label: Label of the node for which you need to look up.
  • node_display_name: Display name of the node for which you want look up.
Returns:

True: If the given node is a "required" node. False: If the given node is not a "required" (i.e., an "optional") node.

def get_node_validation_rules( self, node_label: Optional[str] = None, node_display_name: Optional[str] = None) -> Union[list, dict[str, str]]:
771    def get_node_validation_rules(
772        self, node_label: Optional[str] = None, node_display_name: Optional[str] = None
773    ) -> Union[list, dict[str, str]]:
774        """Get validation rules associated with a node,
775
776        Args:
777            node_label: Label of the node for which you need to look up.
778            node_display_name: Display name of the node which you want to get the label for.
779        Returns:
780            A set of validation rules associated with node, as a list or a dictionary.
781        """
782        node_label = self._get_node_label(node_label, node_display_name)
783
784        if not node_label:
785            return []
786
787        try:
788            node_validation_rules = self.graph.nodes[node_label]["validationRules"]
789        except KeyError as key_error:
790            raise ValueError(
791                f"{node_label} is not in the graph, please provide a proper node label"
792            ) from key_error
793
794        return node_validation_rules

Get validation rules associated with a node,

Arguments:
  • node_label: Label of the node for which you need to look up.
  • node_display_name: Display name of the node which you want to get the label for.
Returns:

A set of validation rules associated with node, as a list or a dictionary.

def get_subgraph_by_edge_type(self, relationship: str) -> networkx.classes.digraph.DiGraph:
796    def get_subgraph_by_edge_type(self, relationship: str) -> nx.DiGraph:
797        """Get a subgraph containing all edges of a given type (aka relationship).
798
799        Args:
800            relationship: edge / link relationship type with possible values same as in above docs.
801
802        Returns:
803            Directed graph on edges of a particular type (aka relationship)
804        """
805
806        # prune the metadata model graph so as to include only those edges that
807        # match the relationship type
808        rel_edges = []
809        for node_1, node_2, key, _ in self.graph.out_edges(data=True, keys=True):
810            if key == relationship:
811                rel_edges.append((node_1, node_2))
812
813        relationship_subgraph: nx.DiGraph = nx.DiGraph()
814        relationship_subgraph.add_edges_from(rel_edges)
815
816        return relationship_subgraph

Get a subgraph containing all edges of a given type (aka relationship).

Arguments:
  • relationship: edge / link relationship type with possible values same as in above docs.
Returns:

Directed graph on edges of a particular type (aka relationship)

def find_adjacent_child_classes( self, node_label: Optional[str] = None, node_display_name: Optional[str] = None) -> list[str]:
818    def find_adjacent_child_classes(
819        self, node_label: Optional[str] = None, node_display_name: Optional[str] = None
820    ) -> list[str]:
821        """Find child classes of a given node.
822        Args:
823            node_display_name: Display name of the node to look up.
824            node_label: Label of the node to look up.
825        Returns:
826            List of nodes that are adjacent to the given node, by SubclassOf relationship.
827        """
828        node_label = self._get_node_label(node_label, node_display_name)
829        return self.get_adjacent_nodes_by_relationship(
830            node_label=node_label,
831            relationship=self.dmr.get_relationship_value("subClassOf", "edge_key"),
832        )

Find child classes of a given node.

Arguments:
  • node_display_name: Display name of the node to look up.
  • node_label: Label of the node to look up.
Returns:

List of nodes that are adjacent to the given node, by SubclassOf relationship.

def find_child_classes(self, schema_class: str) -> list:
834    def find_child_classes(self, schema_class: str) -> list:
835        """Find schema classes that inherit from the given class
836        Args:
837            schema_class: node label for the class to from which to look for children.
838        Returns:
839            list of children to the schema_class.
840        """
841        child_classes = unlist(list(self.graph.successors(schema_class)))
842        assert isinstance(child_classes, list)
843        return child_classes

Find schema classes that inherit from the given class

Arguments:
  • schema_class: node label for the class from which to look for children.
Returns:

list of children to the schema_class.

def find_class_specific_properties(self, schema_class: str) -> list[str]:
845    def find_class_specific_properties(self, schema_class: str) -> list[str]:
846        """Find properties specifically associated with a given class
847        Args:
848            schema_class, str: node/class label, to identify properties for.
849        Returns:
850            properties, list: List of properties associate with a given schema class.
851        Raises:
852            KeyError: Key error is raised if the provided schema_class is not in the graph
853        """
854
855        if not self.is_class_in_schema(schema_class):
856            raise KeyError(
857                (
858                    f"Schema_class provided: {schema_class} is not in the data model, please check "
859                    "that you are providing the proper class/node label"
860                )
861            )
862
863        properties = []
864        for node1, node2 in self.graph.edges():
865            if (
866                node2 == schema_class
867                and "domainValue" in self.graph[node1][schema_class]
868            ):
869                properties.append(node1)
870        return properties

Find properties specifically associated with a given class

Arguments:
  • schema_class, str: node/class label, to identify properties for.
Returns:

properties, list: List of properties associated with a given schema class.

Raises:
  • KeyError: Key error is raised if the provided schema_class is not in the graph
def find_parent_classes(self, node_label: str) -> list[list[str]]:
872    def find_parent_classes(self, node_label: str) -> list[list[str]]:
873        """Find all parents of the provided node
874        Args:
875            node_label: label of the node to find parents of
876        Returns:
877            List of list of Parents to the given node.
878        """
879        # Get digraph of nodes with parents
880        digraph = self.get_digraph_by_edge_type("parentOf")
881
882        # Get root node
883        root_node = list(nx.topological_sort(digraph))[0]
884
885        # Get paths between root_node and the target node.
886        paths = nx.all_simple_paths(self.graph, source=root_node, target=node_label)
887
888        return [_path[:-1] for _path in paths]

Find all parents of the provided node

Arguments:
  • node_label: label of the node to find parents of
Returns:

List of lists of parents of the given node.

def full_schema_graph(self, size: Optional[int] = None) -> graphviz.graphs.Digraph:
890    def full_schema_graph(self, size: Optional[int] = None) -> graphviz.Digraph:
891        """Create a graph of the data model.
892        Args:
893            size, float: max height and width of the graph, if one value provided
894               it is used for both.
895        Returns:
896            schema graph viz
897        """
898        edges = self.graph.edges()
899        return visualize(edges, size=size)

Create a graph of the data model.

Arguments:
  • size, float: max height and width of the graph, if one value provided it is used for both.
Returns:

schema graph viz

def is_class_in_schema(self, node_label: str) -> bool:
901    def is_class_in_schema(self, node_label: str) -> bool:
902        """Determine if provided node_label is in the schema graph/data model.
903        Args:
904            node_label: label of node to search for in the
905        Returns:
906            True, if node is in the graph schema
907            False, if node is not in graph schema
908        """
909        return node_label in self.graph.nodes()

Determine if provided node_label is in the schema graph/data model.

Arguments:
  • node_label: label of node to search for in the schema graph
Returns:

True if the node is in the graph schema; False if the node is not in the graph schema.

def sub_schema_graph( self, source: str, direction: str, size: Optional[float] = None) -> Optional[graphviz.graphs.Digraph]:
911    def sub_schema_graph(
912        self, source: str, direction: str, size: Optional[float] = None
913    ) -> Optional[graphviz.Digraph]:
914        """Create a sub-schema graph
915        Args:
916            source, str: source node label to start graph
917            direction, str: direction to create the visualization, choose from "up", "down", "both"
918            size, float: max height and width of the graph, if one value provided it is used for
919              both.
920        Returns:
921            Sub-schema graph viz
922        """
923        if direction == "down":
924            edges = list(nx.edge_bfs(self.graph, [source]))
925            return visualize(edges, size=size)
926        if direction == "up":
927            paths = self.find_parent_classes(source)
928            edges = []
929            for _path in paths:
930                _path.append(source)
931                for i in range(0, len(_path) - 1):
932                    edges.append((_path[i], _path[i + 1]))
933            return visualize(edges, size=size)
934        if direction == "both":
935            paths = self.find_parent_classes(source)
936            edges = list(nx.edge_bfs(self.graph, [source]))
937            for _path in paths:
938                _path.append(source)
939                for i in range(0, len(_path) - 1):
940                    edges.append((_path[i], _path[i + 1]))
941            return visualize(edges, size=size)
942        return None

Create a sub-schema graph

Arguments:
  • source, str: source node label to start graph
  • direction, str: direction to create the visualization, choose from "up", "down", "both"
  • size, float: max height and width of the graph, if one value provided it is used for both.
Returns:

Sub-schema graph viz

def get_node_column_type( self, node_label: Optional[str] = None, node_display_name: Optional[str] = None) -> Optional[schematic.schemas.constants.JSONSchemaType]:
944    def get_node_column_type(
945        self, node_label: Optional[str] = None, node_display_name: Optional[str] = None
946    ) -> Optional[JSONSchemaType]:
947        """Gets the column type of the node
948
949        Args:
950            node_label: The label of the node to get the type from
951            node_display_name: The display name of the node to get the type from
952
953        Returns:
954            The column type of the node if it has one, otherwise None
955        """
956        node_label = self._get_node_label(node_label, node_display_name)
957        rel_node_label = self.dmr.get_relationship_value("columnType", "node_label")
958        type_string = self.graph.nodes[node_label][rel_node_label]
959        if type_string is None:
960            return type_string
961        return JSONSchemaType(type_string)

Gets the column type of the node

Arguments:
  • node_label: The label of the node to get the type from
  • node_display_name: The display name of the node to get the type from
Returns:

The column type of the node if it has one, otherwise None

def create_data_model_graph_explorer( data_model_path: str) -> DataModelGraphExplorer:
def create_data_model_graph_explorer(data_model_path: str) -> DataModelGraphExplorer:
    """Build a DataModelGraphExplorer from a data model file.

    The model at ``data_model_path`` is parsed with ``DataModelParser``, the
    parsed model is turned into a graph by ``DataModelGraph``, and that graph is
    wrapped in a ``DataModelGraphExplorer``.

    Args:
        data_model_path: The path to the data model to create the explorer from.

    Returns:
        DataModelGraphExplorer: An explorer created using the input data model.
    """
    parsed_model = DataModelParser(path_to_data_model=data_model_path).parse_model()
    model_graph = DataModelGraph(parsed_model).graph
    return DataModelGraphExplorer(model_graph)

Creates a DataModelGraphExplorer using a data model

Arguments:
  • data_model_path: The path to a data model to create the dmge
Returns:

DataModelGraphExplorer: A dmge created using the input data model