From ec062b6787dbf3a4285cc571014b939e778c0fa3 Mon Sep 17 00:00:00 2001 From: Gabe Lyons Date: Mon, 7 Feb 2022 14:29:51 -0800 Subject: [PATCH] feat(glue): make ownership configurable in glue source (#4078) --- metadata-ingestion/source_docs/glue.md | 1 + .../src/datahub/ingestion/source/aws/glue.py | 20 ++++++++++++------- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/metadata-ingestion/source_docs/glue.md b/metadata-ingestion/source_docs/glue.md index a4ddedfb50be8..27f5074f9775f 100644 --- a/metadata-ingestion/source_docs/glue.md +++ b/metadata-ingestion/source_docs/glue.md @@ -92,6 +92,7 @@ Note that a `.` is used to denote nested fields in the YAML recipe. | `ignore_unsupported_connectors` | | `True` | Whether to ignore unsupported connectors. If disabled, an error will be raised. | | `emit_s3_lineage` | | `True` | Whether to emit S3-to-Glue lineage. | | `glue_s3_lineage_direction` | | `upstream` | If `upstream`, S3 is upstream to Glue. If `downstream` S3 is downstream to Glue. | +| `extract_owners` | | `True` | When enabled, extracts ownership from Glue directly and overwrites existing owners. When disabled, ownership is left empty for datasets. | ## Compatibility diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index d91a5f2893408..1491992469983 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -48,6 +48,7 @@ class GlueSourceConfig(AwsSourceConfig): + extract_owners: Optional[bool] = True extract_transforms: Optional[bool] = True underlying_platform: Optional[str] = None ignore_unsupported_connectors: Optional[bool] = True @@ -89,6 +90,7 @@ class GlueSource(Source): def __init__(self, config: GlueSourceConfig, ctx: PipelineContext): super().__init__(ctx) + self.extract_owners = config.extract_owners self.source_config = config self.report = GlueSourceReport() self.glue_client = config.glue_client @@ -612,7 +614,7 @@ def get_workunits(self) -> Iterable[MetadataWorkUnit]: yield dataset_wu def _extract_record(self, table: Dict, table_name: str) -> MetadataChangeEvent: - def get_owner() -> OwnershipClass: + def get_owner() -> Optional[OwnershipClass]: owner = table.get("Owner") if owner: owners = [ @@ -621,11 +623,10 @@ def get_owner() -> OwnershipClass: type=OwnershipTypeClass.DATAOWNER, ) ] - else: - owners = [] - return OwnershipClass( - owners=owners, - ) + return OwnershipClass( + owners=owners, + ) + return None def get_dataset_properties() -> DatasetPropertiesClass: return DatasetPropertiesClass( @@ -680,7 +681,12 @@ def get_schema_metadata(glue_source: GlueSource) -> SchemaMetadata: ) dataset_snapshot.aspects.append(Status(removed=False)) - dataset_snapshot.aspects.append(get_owner()) + + if self.extract_owners: + optional_owner_aspect = get_owner() + if optional_owner_aspect is not None: + dataset_snapshot.aspects.append(optional_owner_aspect) + dataset_snapshot.aspects.append(get_dataset_properties()) dataset_snapshot.aspects.append(get_schema_metadata(self))