Source code for feast_spark.constants

from typing import Optional

from feast.constants import ConfigMeta


[docs]class ConfigOptions(metaclass=ConfigMeta):
    #: Default Feast Job Service URL
    JOB_SERVICE_URL: Optional[str] = None

    #: Enable or disable TLS/SSL to Feast Job Service
    JOB_SERVICE_ENABLE_SSL: str = "False"

    #: Path to certificate(s) to secure connection to Feast Job Service
    JOB_SERVICE_SERVER_SSL_CERT: str = ""

    #: Enable or disable control loop for Feast Job Service
    JOB_SERVICE_ENABLE_CONTROL_LOOP: str = "False"

    #: If set to True, the control loop will try to restart failed streaming jobs
    JOB_SERVICE_RETRY_FAILED_JOBS: str = "False"

    #: Pause in seconds between starting new jobs in the control loop
    JOB_SERVICE_PAUSE_BETWEEN_JOBS: str = "5"

    #: Port on which the Prometheus metric server will run
    JOB_SERVICE_PROMETHEUS_METRIC_PORT: int = 8080

    #: Endpoint URL for S3 storage_client
    S3_ENDPOINT_URL: Optional[str] = None

    #: Spark Job launcher. The choice of storage is connected to the choice of SPARK_LAUNCHER.
    #:
    #: Options: "standalone", "dataproc", "emr"
    SPARK_LAUNCHER: Optional[str] = None

    #: Staging location for Feast Spark ingestion jobs. The choice of storage is connected to the choice of SPARK_LAUNCHER.
    #:
    #: E.g. gs://some-bucket/output/, s3://some-bucket/output/, file:///data/subfolder/
    SPARK_STAGING_LOCATION: Optional[str] = None

    #: Feast Spark ingestion jar file. The choice of storage is connected to the choice of SPARK_LAUNCHER.
    #:
    #: E.g. "dataproc" (http and gs), "emr" (http and s3), "standalone" (http and file)
    SPARK_INGESTION_JAR: str = "https://storage.googleapis.com/feast-jobs/spark/ingestion/feast-ingestion-spark-develop.jar"

    #: Spark resource manager master URL
    SPARK_STANDALONE_MASTER: str = "local[*]"

    #: Directory where Spark is installed
    SPARK_HOME: Optional[str] = None

    #: The project id where the materialized view of a BigQuerySource will be created.
    #: By default, the same project where the view is located is used.
    SPARK_BQ_MATERIALIZATION_PROJECT: Optional[str] = None

    #: The dataset id where the materialized view of a BigQuerySource will be created.
    #: By default, the same dataset where the view is located is used.
    SPARK_BQ_MATERIALIZATION_DATASET: Optional[str] = None

    #: Dataproc cluster to run Feast Spark Jobs in
    DATAPROC_CLUSTER_NAME: Optional[str] = None

    #: Project of Dataproc cluster
    DATAPROC_PROJECT: Optional[str] = None

    #: Region of Dataproc cluster
    DATAPROC_REGION: Optional[str] = None

    #: Number of executor instances for Dataproc cluster
    DATAPROC_EXECUTOR_INSTANCES = "2"

    #: Number of executor cores for Dataproc cluster
    DATAPROC_EXECUTOR_CORES = "2"

    #: Amount of executor memory for Dataproc cluster
    DATAPROC_EXECUTOR_MEMORY = "2g"

    #: Namespace to use for Spark jobs launched via the k8s spark operator
    SPARK_K8S_NAMESPACE = "default"

    #: Expect the k8s spark operator to be running in the same cluster as Feast
    SPARK_K8S_USE_INCLUSTER_CONFIG = "True"

    #: SparkApplication resource template
    SPARK_K8S_JOB_TEMPLATE_PATH = None

    #: SparkApplication resource template for Batch Ingestion Jobs
    SPARK_K8S_BATCH_INGESTION_TEMPLATE_PATH: Optional[str] = ""

    #: SparkApplication resource template for Stream Ingestion Jobs
    SPARK_K8S_STREAM_INGESTION_TEMPLATE_PATH: Optional[str] = ""

    #: SparkApplication resource template for Historical Retrieval Jobs
    SPARK_K8S_HISTORICAL_RETRIEVAL_TEMPLATE_PATH: Optional[str] = ""

    #: Host of the Redis instance which stores Spark Ingestion Job metrics
    SPARK_METRICS_REDIS_HOST: Optional[str] = None

    #: Port of the Redis instance which stores Spark Ingestion Job metrics
    SPARK_METRICS_REDIS_PORT: Optional[str] = None

    #: Host of the Redis instance which stores locks for job management
    LOCK_MGR_REDIS_HOST: Optional[str] = None

    #: Port of the Redis instance which stores locks for job management
    LOCK_MGR_REDIS_PORT: Optional[str] = None

    #: TTL for job management locks
    LOCK_EXPIRY: Optional[str] = "60"

    #: File format of historical retrieval features
    HISTORICAL_FEATURE_OUTPUT_FORMAT: str = "parquet"

    #: File location of historical retrieval features
    HISTORICAL_FEATURE_OUTPUT_LOCATION: Optional[str] = None

    #: Default Redis host
    REDIS_HOST: Optional[str] = ""

    #: Default Redis port
    REDIS_PORT: Optional[str] = ""

    #: Redis credentials
    REDIS_PASSWORD: Optional[str] = ""

    #: Enable or disable TLS/SSL to Redis
    REDIS_SSL: Optional[str] = "False"

    #: BigTable Project ID
    BIGTABLE_PROJECT: Optional[str] = ""

    #: BigTable Instance ID
    BIGTABLE_INSTANCE: Optional[str] = ""

    #: Cassandra host. Can be a comma-separated string
    CASSANDRA_HOST: Optional[str] = ""

    #: Cassandra port
    CASSANDRA_PORT: Optional[str] = ""

    #: Enable or disable StatsD
    STATSD_ENABLED: str = "False"

    #: Default StatsD host
    STATSD_HOST: Optional[str] = None

    #: Default StatsD port
    STATSD_PORT: Optional[str] = None

    #: Ingestion Job dead-letter destination. The choice of storage is connected to the choice of SPARK_LAUNCHER.
    #:
    #: E.g. gs://some-bucket/output/, s3://some-bucket/output/, file:///data/subfolder/
    DEADLETTER_PATH: str = ""

    #: Ingestion Job checkpoint location. Same format as DEADLETTER_PATH
    CHECKPOINT_PATH: str = ""

    #: ProtoRegistry address (currently only Stencil Server is supported as registry)
    #: https://github.com/odpf/stencil
    STENCIL_URL: str = ""

    #: Bearer token used for authentication with Stencil Server
    STENCIL_TOKEN: str = ""

    #: If set to True, rows that do not pass custom validation (see feast.contrib.validation)
    #: won't be saved to Online Storage
    INGESTION_DROP_INVALID_ROWS: str = "False"

    #: EMR cluster to run Feast Spark Jobs in
    EMR_CLUSTER_ID: Optional[str] = None

    #: Region of EMR cluster
    EMR_REGION: Optional[str] = None

    #: Template path of EMR cluster
    EMR_CLUSTER_TEMPLATE_PATH: Optional[str] = None

    #: Log path of EMR cluster
    EMR_LOG_LOCATION: Optional[str] = None

    #: Whitelisted Feast Job Types
    WHITELISTED_JOB_TYPES: Optional[str] = None

    #: Whitelisted Feast projects
    WHITELISTED_PROJECTS: Optional[str] = None

    #: File path to a whitelist containing all the feature tables allowed for ingestion.
    #: Each line in the file should be in the format <project>:<feature table>
    WHITELISTED_FEATURE_TABLES_PATH: Optional[str] = None

    #: If set, the streaming ingestion job will consume incoming rows not continuously,
    #: but periodically, at the configured interval (in seconds).
    #: This may help to control the number of write requests to storage
    SPARK_STREAMING_TRIGGERING_INTERVAL: Optional[str] = None

    #: GCP project of the BigQuery dataset used to stage the entities during historical
    #: feature retrieval. If not set, the GCP project of the feature table batch source
    #: will be used instead.
    BQ_STAGING_PROJECT: Optional[str] = None

    #: BigQuery dataset used to stage the entities during historical feature retrieval.
    #: If not set, the BigQuery dataset of the batch source will be used instead.
    BQ_STAGING_DATASET: Optional[str] = None
[docs]    def defaults(self):
        return {
            k: getattr(self, k)
            for k in self.__config_keys__
            if getattr(self, k) is not None
        }
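
ConfigOptions.defaults() above returns only the options whose value is not None, keyed by the names listed in __config_keys__ (presumably populated by feast's ConfigMeta metaclass). A minimal, self-contained sketch of that filtering pattern, using a hypothetical SketchOptions class with a hand-written __config_keys__ list rather than the real ConfigOptions:

from typing import Optional


class SketchOptions:
    """Hypothetical stand-in mirroring the shape of ConfigOptions above."""

    JOB_SERVICE_URL: Optional[str] = None      # default is None -> excluded from defaults()
    JOB_SERVICE_ENABLE_SSL: str = "False"      # non-None default -> included
    SPARK_STANDALONE_MASTER: str = "local[*]"  # non-None default -> included

    # In the real class this list is provided by the metaclass; here it is
    # spelled out by hand for the sake of the sketch.
    __config_keys__ = [
        "JOB_SERVICE_URL",
        "JOB_SERVICE_ENABLE_SSL",
        "SPARK_STANDALONE_MASTER",
    ]

    def defaults(self):
        # Same filter as ConfigOptions.defaults(): keep only non-None options
        return {
            k: getattr(self, k)
            for k in self.__config_keys__
            if getattr(self, k) is not None
        }


print(SketchOptions().defaults())
# {'JOB_SERVICE_ENABLE_SSL': 'False', 'SPARK_STANDALONE_MASTER': 'local[*]'}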