Skip to main content

DataFlow

Aspects

dataFlowKey

Key for a Data Flow

Schema
{
"type": "record",
"Aspect": {
"name": "dataFlowKey"
},
"name": "DataFlowKey",
"namespace": "com.linkedin.metadata.key",
"fields": [
{
"Searchable": {
"fieldType": "TEXT_PARTIAL"
},
"type": "string",
"name": "orchestrator",
"doc": "Workflow manager like azkaban, airflow which orchestrates the flow"
},
{
"Searchable": {
"enableAutocomplete": true,
"fieldType": "TEXT_PARTIAL"
},
"type": "string",
"name": "flowId",
"doc": "Unique Identifier of the data flow"
},
{
"Searchable": {
"fieldType": "TEXT_PARTIAL"
},
"type": "string",
"name": "cluster",
"doc": "Cluster where the flow is executed"
}
],
"doc": "Key for a Data Flow"
}

dataFlowInfo

Information about a Data processing flow

Schema
{
"type": "record",
"Aspect": {
"name": "dataFlowInfo"
},
"name": "DataFlowInfo",
"namespace": "com.linkedin.datajob",
"fields": [
{
"Searchable": {
"/*": {
"queryByDefault": true
}
},
"type": {
"type": "map",
"values": "string"
},
"name": "customProperties",
"default": {},
"doc": "Custom property bag."
},
{
"java": {
"class": "com.linkedin.common.url.Url",
"coercerClass": "com.linkedin.common.url.UrlCoercer"
},
"type": [
"null",
"string"
],
"name": "externalUrl",
"default": null,
"doc": "URL where the reference exist"
},
{
"Searchable": {
"boostScore": 10.0,
"enableAutocomplete": true,
"fieldType": "TEXT_PARTIAL"
},
"type": "string",
"name": "name",
"doc": "Flow name"
},
{
"Searchable": {
"fieldType": "TEXT",
"hasValuesFieldName": "hasDescription"
},
"type": [
"null",
"string"
],
"name": "description",
"default": null,
"doc": "Flow description"
},
{
"Searchable": {
"fieldType": "TEXT_PARTIAL",
"queryByDefault": false
},
"type": [
"null",
"string"
],
"name": "project",
"default": null,
"doc": "Optional project/namespace associated with the flow"
},
{
"Searchable": {
"/time": {
"fieldName": "createdAt",
"fieldType": "DATETIME"
}
},
"type": [
"null",
{
"type": "record",
"name": "TimeStamp",
"namespace": "com.linkedin.common",
"fields": [
{
"type": "long",
"name": "time",
"doc": "When did the event occur"
},
{
"java": {
"class": "com.linkedin.common.urn.Urn"
},
"type": [
"null",
"string"
],
"name": "actor",
"default": null,
"doc": "Optional: The actor urn involved in the event."
}
],
"doc": "A standard event timestamp"
}
],
"name": "created",
"default": null,
"doc": "A timestamp documenting when the asset was created in the source Data Platform (not on DataHub)"
},
{
"Searchable": {
"/time": {
"fieldName": "lastModifiedAt",
"fieldType": "DATETIME"
}
},
"type": [
"null",
"com.linkedin.common.TimeStamp"
],
"name": "lastModified",
"default": null,
"doc": "A timestamp documenting when the asset was last modified in the source Data Platform (not on DataHub)"
}
],
"doc": "Information about a Data processing flow"
}

editableDataFlowProperties

Stores editable changes made to properties. This separates changes made from ingestion pipelines and edits in the UI to avoid accidental overwrites of user-provided data by ingestion pipelines

Schema
{
"type": "record",
"Aspect": {
"name": "editableDataFlowProperties"
},
"name": "EditableDataFlowProperties",
"namespace": "com.linkedin.datajob",
"fields": [
{
"type": {
"type": "record",
"name": "AuditStamp",
"namespace": "com.linkedin.common",
"fields": [
{
"type": "long",
"name": "time",
"doc": "When did the resource/association/sub-resource move into the specific lifecycle stage represented by this AuditEvent."
},
{
"java": {
"class": "com.linkedin.common.urn.Urn"
},
"type": "string",
"name": "actor",
"doc": "The entity (e.g. a member URN) which will be credited for moving the resource/association/sub-resource into the specific lifecycle stage. It is also the one used to authorize the change."
},
{
"java": {
"class": "com.linkedin.common.urn.Urn"
},
"type": [
"null",
"string"
],
"name": "impersonator",
"default": null,
"doc": "The entity (e.g. a service URN) which performs the change on behalf of the Actor and must be authorized to act as the Actor."
},
{
"type": [
"null",
"string"
],
"name": "message",
"default": null,
"doc": "Additional context around how DataHub was informed of the particular change. For example: was the change created by an automated process, or manually."
}
],
"doc": "Data captured on a resource/association/sub-resource level giving insight into when that resource/association/sub-resource moved into a particular lifecycle stage, and who acted to move it into that specific lifecycle stage."
},
"name": "created",
"default": {
"actor": "urn:li:corpuser:unknown",
"impersonator": null,
"time": 0,
"message": null
},
"doc": "An AuditStamp corresponding to the creation of this resource/association/sub-resource. A value of 0 for time indicates missing data."
},
{
"type": "com.linkedin.common.AuditStamp",
"name": "lastModified",
"default": {
"actor": "urn:li:corpuser:unknown",
"impersonator": null,
"time": 0,
"message": null
},
"doc": "An AuditStamp corresponding to the last modification of this resource/association/sub-resource. If no modification has happened since creation, lastModified should be the same as created. A value of 0 for time indicates missing data."
},
{
"type": [
"null",
"com.linkedin.common.AuditStamp"
],
"name": "deleted",
"default": null,
"doc": "An AuditStamp corresponding to the deletion of this resource/association/sub-resource. Logically, deleted MUST have a later timestamp than creation. It may or may not have the same time as lastModified depending upon the resource/association/sub-resource semantics."
},
{
"Searchable": {
"fieldName": "editedDescription",
"fieldType": "TEXT"
},
"type": [
"null",
"string"
],
"name": "description",
"default": null,
"doc": "Edited documentation of the data flow"
}
],
"doc": "Stores editable changes made to properties. This separates changes made from\ningestion pipelines and edits in the UI to avoid accidental overwrites of user-provided data by ingestion pipelines"
}

ownership

Ownership information of an entity.

Schema
{
"type": "record",
"Aspect": {
"name": "ownership"
},
"name": "Ownership",
"namespace": "com.linkedin.common",
"fields": [
{
"type": {
"type": "array",
"items": {
"type": "record",
"name": "Owner",
"namespace": "com.linkedin.common",
"fields": [
{
"Relationship": {
"entityTypes": [
"corpuser",
"corpGroup"
],
"name": "OwnedBy"
},
"Searchable": {
"addToFilters": true,
"fieldName": "owners",
"fieldType": "URN",
"filterNameOverride": "Owned By",
"hasValuesFieldName": "hasOwners",
"queryByDefault": false
},
"java": {
"class": "com.linkedin.common.urn.Urn"
},
"type": "string",
"name": "owner",
"doc": "Owner URN, e.g. urn:li:corpuser:ldap, urn:li:corpGroup:group_name, and urn:li:multiProduct:mp_name\n(Caveat: only corpuser is currently supported in the frontend.)"
},
{
"type": {
"type": "enum",
"symbolDocs": {
"BUSINESS_OWNER": "A person or group who is responsible for logical, or business related, aspects of the asset.",
"CONSUMER": "A person, group, or service that consumes the data\nDeprecated! Use TECHNICAL_OWNER or BUSINESS_OWNER instead.",
"DATAOWNER": "A person or group that is owning the data\nDeprecated! Use TECHNICAL_OWNER instead.",
"DATA_STEWARD": "A steward, expert, or delegate responsible for the asset.",
"DELEGATE": "A person or a group that overseas the operation, e.g. a DBA or SRE.\nDeprecated! Use TECHNICAL_OWNER instead.",
"DEVELOPER": "A person or group that is in charge of developing the code\nDeprecated! Use TECHNICAL_OWNER instead.",
"NONE": "No specific type associated to the owner.",
"PRODUCER": "A person, group, or service that produces/generates the data\nDeprecated! Use TECHNICAL_OWNER instead.",
"STAKEHOLDER": "A person or a group that has direct business interest\nDeprecated! Use TECHNICAL_OWNER, BUSINESS_OWNER, or STEWARD instead.",
"TECHNICAL_OWNER": "person or group who is responsible for technical aspects of the asset."
},
"deprecatedSymbols": {
"CONSUMER": true,
"DATAOWNER": true,
"DELEGATE": true,
"DEVELOPER": true,
"PRODUCER": true,
"STAKEHOLDER": true
},
"name": "OwnershipType",
"namespace": "com.linkedin.common",
"symbols": [
"TECHNICAL_OWNER",
"BUSINESS_OWNER",
"DATA_STEWARD",
"NONE",
"DEVELOPER",
"DATAOWNER",
"DELEGATE",
"PRODUCER",
"CONSUMER",
"STAKEHOLDER"
],
"doc": "Asset owner types"
},
"name": "type",
"doc": "The type of the ownership"
},
{
"type": [
"null",
{
"type": "record",
"name": "OwnershipSource",
"namespace": "com.linkedin.common",
"fields": [
{
"type": {
"type": "enum",
"symbolDocs": {
"AUDIT": "Auditing system or audit logs",
"DATABASE": "Database, e.g. GRANTS table",
"FILE_SYSTEM": "File system, e.g. file/directory owner",
"ISSUE_TRACKING_SYSTEM": "Issue tracking system, e.g. Jira",
"MANUAL": "Manually provided by a user",
"OTHER": "Other sources",
"SERVICE": "Other ownership-like service, e.g. Nuage, ACL service etc",
"SOURCE_CONTROL": "SCM system, e.g. GIT, SVN"
},
"name": "OwnershipSourceType",
"namespace": "com.linkedin.common",
"symbols": [
"AUDIT",
"DATABASE",
"FILE_SYSTEM",
"ISSUE_TRACKING_SYSTEM",
"MANUAL",
"SERVICE",
"SOURCE_CONTROL",
"OTHER"
]
},
"name": "type",
"doc": "The type of the source"
},
{
"type": [
"null",
"string"
],
"name": "url",
"default": null,
"doc": "A reference URL for the source"
}
],
"doc": "Source/provider of the ownership information"
}
],
"name": "source",
"default": null,
"doc": "Source information for the ownership"
}
],
"doc": "Ownership information"
}
},
"name": "owners",
"doc": "List of owners of the entity."
},
{
"type": {
"type": "record",
"name": "AuditStamp",
"namespace": "com.linkedin.common",
"fields": [
{
"type": "long",
"name": "time",
"doc": "When did the resource/association/sub-resource move into the specific lifecycle stage represented by this AuditEvent."
},
{
"java": {
"class": "com.linkedin.common.urn.Urn"
},
"type": "string",
"name": "actor",
"doc": "The entity (e.g. a member URN) which will be credited for moving the resource/association/sub-resource into the specific lifecycle stage. It is also the one used to authorize the change."
},
{
"java": {
"class": "com.linkedin.common.urn.Urn"
},
"type": [
"null",
"string"
],
"name": "impersonator",
"default": null,
"doc": "The entity (e.g. a service URN) which performs the change on behalf of the Actor and must be authorized to act as the Actor."
},
{
"type": [
"null",
"string"
],
"name": "message",
"default": null,
"doc": "Additional context around how DataHub was informed of the particular change. For example: was the change created by an automated process, or manually."
}
],
"doc": "Data captured on a resource/association/sub-resource level giving insight into when that resource/association/sub-resource moved into a particular lifecycle stage, and who acted to move it into that specific lifecycle stage."
},
"name": "lastModified",
"default": {
"actor": "urn:li:corpuser:unknown",
"impersonator": null,
"time": 0,
"message": null
},
"doc": "Audit stamp containing who last modified the record and when. A value of 0 in the time field indicates missing data."
}
],
"doc": "Ownership information of an entity."
}

status

The lifecycle status metadata of an entity, e.g. dataset, metric, feature, etc. This aspect is used to represent soft deletes conventionally.

Schema
{
"type": "record",
"Aspect": {
"name": "status"
},
"name": "Status",
"namespace": "com.linkedin.common",
"fields": [
{
"Searchable": {
"fieldType": "BOOLEAN"
},
"type": "boolean",
"name": "removed",
"default": false,
"doc": "Whether the entity has been removed (soft-deleted)."
}
],
"doc": "The lifecycle status metadata of an entity, e.g. dataset, metric, feature, etc.\nThis aspect is used to represent soft deletes conventionally."
}

globalTags

Tag aspect used for applying tags to an entity

Schema
{
"type": "record",
"Aspect": {
"name": "globalTags"
},
"name": "GlobalTags",
"namespace": "com.linkedin.common",
"fields": [
{
"Relationship": {
"/*/tag": {
"entityTypes": [
"tag"
],
"name": "TaggedWith"
}
},
"Searchable": {
"/*/tag": {
"addToFilters": true,
"boostScore": 0.5,
"fieldName": "tags",
"fieldType": "URN",
"filterNameOverride": "Tag",
"hasValuesFieldName": "hasTags",
"queryByDefault": true
}
},
"type": {
"type": "array",
"items": {
"type": "record",
"name": "TagAssociation",
"namespace": "com.linkedin.common",
"fields": [
{
"java": {
"class": "com.linkedin.common.urn.TagUrn"
},
"type": "string",
"name": "tag",
"doc": "Urn of the applied tag"
},
{
"type": [
"null",
"string"
],
"name": "context",
"default": null,
"doc": "Additional context about the association"
}
],
"doc": "Properties of an applied tag. For now, just an Urn. In the future we can extend this with other properties, e.g.\npropagation parameters."
}
},
"name": "tags",
"doc": "Tags associated with a given entity"
}
],
"doc": "Tag aspect used for applying tags to an entity"
}

browsePaths

Shared aspect containing Browse Paths to be indexed for an entity.

Schema
{
"type": "record",
"Aspect": {
"name": "browsePaths"
},
"name": "BrowsePaths",
"namespace": "com.linkedin.common",
"fields": [
{
"Searchable": {
"/*": {
"fieldName": "browsePaths",
"fieldType": "BROWSE_PATH"
}
},
"type": {
"type": "array",
"items": "string"
},
"name": "paths",
"doc": "A list of valid browse paths for the entity.\n\nBrowse paths are expected to be forward slash-separated strings. For example: 'prod/snowflake/datasetName'"
}
],
"doc": "Shared aspect containing Browse Paths to be indexed for an entity."
}

glossaryTerms

Related business terms information

Schema
{
"type": "record",
"Aspect": {
"name": "glossaryTerms"
},
"name": "GlossaryTerms",
"namespace": "com.linkedin.common",
"fields": [
{
"type": {
"type": "array",
"items": {
"type": "record",
"name": "GlossaryTermAssociation",
"namespace": "com.linkedin.common",
"fields": [
{
"Relationship": {
"entityTypes": [
"glossaryTerm"
],
"name": "TermedWith"
},
"Searchable": {
"addToFilters": true,
"fieldName": "glossaryTerms",
"fieldType": "URN",
"filterNameOverride": "Glossary Term",
"hasValuesFieldName": "hasGlossaryTerms"
},
"java": {
"class": "com.linkedin.common.urn.GlossaryTermUrn"
},
"type": "string",
"name": "urn",
"doc": "Urn of the applied glossary term"
},
{
"type": [
"null",
"string"
],
"name": "context",
"default": null,
"doc": "Additional context about the association"
}
],
"doc": "Properties of an applied glossary term."
}
},
"name": "terms",
"doc": "The related business terms"
},
{
"type": {
"type": "record",
"name": "AuditStamp",
"namespace": "com.linkedin.common",
"fields": [
{
"type": "long",
"name": "time",
"doc": "When did the resource/association/sub-resource move into the specific lifecycle stage represented by this AuditEvent."
},
{
"java": {
"class": "com.linkedin.common.urn.Urn"
},
"type": "string",
"name": "actor",
"doc": "The entity (e.g. a member URN) which will be credited for moving the resource/association/sub-resource into the specific lifecycle stage. It is also the one used to authorize the change."
},
{
"java": {
"class": "com.linkedin.common.urn.Urn"
},
"type": [
"null",
"string"
],
"name": "impersonator",
"default": null,
"doc": "The entity (e.g. a service URN) which performs the change on behalf of the Actor and must be authorized to act as the Actor."
},
{
"type": [
"null",
"string"
],
"name": "message",
"default": null,
"doc": "Additional context around how DataHub was informed of the particular change. For example: was the change created by an automated process, or manually."
}
],
"doc": "Data captured on a resource/association/sub-resource level giving insight into when that resource/association/sub-resource moved into a particular lifecycle stage, and who acted to move it into that specific lifecycle stage."
},
"name": "auditStamp",
"doc": "Audit stamp containing who reported the related business term"
}
],
"doc": "Related business terms information"
}

institutionalMemory

Institutional memory of an entity. This is a way to link to relevant documentation and provide description of the documentation. Institutional or tribal knowledge is very important for users to leverage the entity.

Schema
{
"type": "record",
"Aspect": {
"name": "institutionalMemory"
},
"name": "InstitutionalMemory",
"namespace": "com.linkedin.common",
"fields": [
{
"type": {
"type": "array",
"items": {
"type": "record",
"name": "InstitutionalMemoryMetadata",
"namespace": "com.linkedin.common",
"fields": [
{
"java": {
"class": "com.linkedin.common.url.Url",
"coercerClass": "com.linkedin.common.url.UrlCoercer"
},
"type": "string",
"name": "url",
"doc": "Link to an engineering design document or a wiki page."
},
{
"type": "string",
"name": "description",
"doc": "Description of the link."
},
{
"type": {
"type": "record",
"name": "AuditStamp",
"namespace": "com.linkedin.common",
"fields": [
{
"type": "long",
"name": "time",
"doc": "When did the resource/association/sub-resource move into the specific lifecycle stage represented by this AuditEvent."
},
{
"java": {
"class": "com.linkedin.common.urn.Urn"
},
"type": "string",
"name": "actor",
"doc": "The entity (e.g. a member URN) which will be credited for moving the resource/association/sub-resource into the specific lifecycle stage. It is also the one used to authorize the change."
},
{
"java": {
"class": "com.linkedin.common.urn.Urn"
},
"type": [
"null",
"string"
],
"name": "impersonator",
"default": null,
"doc": "The entity (e.g. a service URN) which performs the change on behalf of the Actor and must be authorized to act as the Actor."
},
{
"type": [
"null",
"string"
],
"name": "message",
"default": null,
"doc": "Additional context around how DataHub was informed of the particular change. For example: was the change created by an automated process, or manually."
}
],
"doc": "Data captured on a resource/association/sub-resource level giving insight into when that resource/association/sub-resource moved into a particular lifecycle stage, and who acted to move it into that specific lifecycle stage."
},
"name": "createStamp",
"doc": "Audit stamp associated with creation of this record"
}
],
"doc": "Metadata corresponding to a record of institutional memory."
}
},
"name": "elements",
"doc": "List of records that represent institutional memory of an entity. Each record consists of a link, description, creator and timestamps associated with that record."
}
],
"doc": "Institutional memory of an entity. This is a way to link to relevant documentation and provide description of the documentation. Institutional or tribal knowledge is very important for users to leverage the entity."
}

dataPlatformInstance

The specific instance of the data platform that this entity belongs to

Schema
{
"type": "record",
"Aspect": {
"name": "dataPlatformInstance"
},
"name": "DataPlatformInstance",
"namespace": "com.linkedin.common",
"fields": [
{
"Searchable": {
"addToFilters": true,
"fieldType": "URN",
"filterNameOverride": "Platform"
},
"java": {
"class": "com.linkedin.common.urn.Urn"
},
"type": "string",
"name": "platform",
"doc": "Data Platform"
},
{
"Searchable": {
"addToFilters": true,
"fieldName": "platformInstance",
"fieldType": "URN",
"filterNameOverride": "Platform Instance"
},
"java": {
"class": "com.linkedin.common.urn.Urn"
},
"type": [
"null",
"string"
],
"name": "instance",
"default": null,
"doc": "Instance of the data platform (e.g. db instance)"
}
],
"doc": "The specific instance of the data platform that this entity belongs to"
}

domains

Links from an Asset to its Domains

Schema
{
"type": "record",
"Aspect": {
"name": "domains"
},
"name": "Domains",
"namespace": "com.linkedin.domain",
"fields": [
{
"Relationship": {
"/*": {
"entityTypes": [
"domain"
],
"name": "AssociatedWith"
}
},
"Searchable": {
"/*": {
"addToFilters": true,
"fieldName": "domains",
"fieldType": "URN",
"filterNameOverride": "Domain",
"hasValuesFieldName": "hasDomain"
}
},
"type": {
"type": "array",
"items": "string"
},
"name": "domains",
"doc": "The Domains attached to an Asset"
}
],
"doc": "Links from an Asset to its Domains"
}

deprecation

Deprecation status of an entity

Schema
{
"type": "record",
"Aspect": {
"name": "deprecation"
},
"name": "Deprecation",
"namespace": "com.linkedin.common",
"fields": [
{
"Searchable": {
"fieldType": "BOOLEAN",
"weightsPerFieldValue": {
"true": 0.5
}
},
"type": "boolean",
"name": "deprecated",
"doc": "Whether the entity is deprecated."
},
{
"type": [
"null",
"long"
],
"name": "decommissionTime",
"default": null,
"doc": "The time user plan to decommission this entity."
},
{
"type": "string",
"name": "note",
"doc": "Additional information about the entity deprecation plan, such as the wiki, doc, RB."
},
{
"java": {
"class": "com.linkedin.common.urn.Urn"
},
"type": "string",
"name": "actor",
"doc": "The user URN which will be credited for modifying this deprecation content."
}
],
"doc": "Deprecation status of an entity"
}

versionInfo

Information about a Data processing job

Schema
{
"type": "record",
"Aspect": {
"name": "versionInfo"
},
"name": "VersionInfo",
"namespace": "com.linkedin.datajob",
"fields": [
{
"Searchable": {
"/*": {
"queryByDefault": true
}
},
"type": {
"type": "map",
"values": "string"
},
"name": "customProperties",
"default": {},
"doc": "Custom property bag."
},
{
"java": {
"class": "com.linkedin.common.url.Url",
"coercerClass": "com.linkedin.common.url.UrlCoercer"
},
"type": [
"null",
"string"
],
"name": "externalUrl",
"default": null,
"doc": "URL where the reference exist"
},
{
"type": "string",
"name": "version",
"doc": "The version which can indentify a job version like a commit hash or md5 hash"
},
{
"type": "string",
"name": "versionType",
"doc": "The type of the version like git hash or md5 hash"
}
],
"doc": "Information about a Data processing job"
}

containerPath

Shared aspect containing Container Paths to be indexed for an entity.

Schema
{
"type": "record",
"Aspect": {
"name": "containerPath"
},
"name": "ContainerPath",
"namespace": "com.linkedin.common",
"fields": [
{
"type": {
"type": "array",
"items": {
"type": "record",
"name": "ContainerPathEntry",
"namespace": "com.linkedin.common",
"fields": [
{
"type": "string",
"name": "id",
"doc": "The ID of the container path entry. This is what gets stored in the index after URL encoding.\nIf there's an urn associated with this entry, id and urn will be the same"
},
{
"java": {
"class": "com.linkedin.common.urn.Urn"
},
"type": [
"null",
"string"
],
"name": "urn",
"default": null,
"doc": "Optional urn pointing to some entity in DataHub"
}
],
"doc": "Represents a single level in an entity's container path"
}
},
"name": "path",
"doc": "A valid container path for the entity. This field is provided by DataHub by default.\n\nContainer paths are stored in elasticsearch as slash-separated strings and only include platform specific folders or containers.\nThese paths should not include high level info captured elsewhere ie. Platform and Environment."
}
],
"doc": "Shared aspect containing Container Paths to be indexed for an entity."
}

Relationships

Outgoing

These are the relationships stored in this entity's aspects

  • OwnedBy

    • Corpuser via ownership.owners.owner
    • CorpGroup via ownership.owners.owner
  • TaggedWith

    • Tag via globalTags.tags
  • TermedWith

    • GlossaryTerm via glossaryTerms.terms.urn
  • AssociatedWith

    • Domain via domains.domains

Incoming

These are the relationships stored in other entity's aspects

  • IsPartOf

    • DataJob via dataJobKey.flow
  • InstanceOf

    • DataProcessInstance via dataProcessInstanceRelationships.parentTemplate

Global Metadata Model

Global Graph