Source code for feast_spark.pyspark.launchers.standalone.local

import os
import socket
import subprocess
import threading
import uuid
from contextlib import closing
from datetime import datetime
from typing import Dict, List, Optional

import requests
from requests.exceptions import RequestException

from feast_spark.pyspark.abc import (
    BQ_SPARK_PACKAGE,
    BatchIngestionJob,
    BatchIngestionJobParameters,
    JobLauncher,
    RetrievalJob,
    RetrievalJobParameters,
    ScheduledBatchIngestionJobParameters,
    SparkJob,
    SparkJobFailure,
    SparkJobParameters,
    SparkJobStatus,
    StreamIngestionJob,
    StreamIngestionJobParameters,
)


class JobCache:
    """
    A *global* in-memory cache of Spark jobs. This is necessary because in local
    mode there is no external state we can use to keep track of running Spark
    jobs (unlike EMR and Dataproc, which track the running jobs for us).
    """

    # Map of job_id -> Spark job
    job_by_id: Dict[str, SparkJob]

    # Map of job_id -> job_hash. The value can be None, indicating that this job
    # was created manually and the Job Service isn't maintaining its state.
    hash_by_id: Dict[str, Optional[str]]

    # This reentrant lock is necessary for multi-threaded access
    lock: threading.RLock

    def __init__(self):
        self.job_by_id = {}
        self.hash_by_id = {}
        self.lock = threading.RLock()

    def add_job(self, job: SparkJob) -> None:
        """Add a Spark job to the cache.

        Args:
            job (SparkJob): The new Spark job to add.
        """
        with self.lock:
            self.job_by_id[job.get_id()] = job
            if isinstance(job, StreamIngestionJob):
                self.hash_by_id[job.get_id()] = job.get_hash()

    def list_jobs(self) -> List[SparkJob]:
        """List all Spark jobs in the cache."""
        with self.lock:
            return list(self.job_by_id.values())

    def get_job_by_id(self, job_id: str) -> SparkJob:
        """Get the Spark job with the given ID. Raises an exception if no such
        job exists.

        Args:
            job_id (str): External ID of the Spark job to get.

        Returns:
            SparkJob: The Spark job with the given ID.
        """
        with self.lock:
            return self.job_by_id[job_id]


global_job_cache = JobCache()


def reset_job_cache():
    global global_job_cache
    global_job_cache = JobCache()
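# Illustrative usage sketch, not part of the upstream module: the cache is a
# plain dict guarded by an RLock, so launchers register jobs right after
# submission and look them up again by ID. The function name and the `job`
# argument are hypothetical; any SparkJob implementation from this file works.
def _job_cache_usage_example(job: SparkJob) -> None:
    global_job_cache.add_job(job)
    assert global_job_cache.get_job_by_id(job.get_id()) is job
    print([j.get_id() for j in global_job_cache.list_jobs()])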
def _find_free_port():
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.bind(("", 0))
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        return s.getsockname()[1]
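# Illustrative sketch, not part of the upstream module: binding to port 0 asks
# the OS for an ephemeral port, which the launcher below passes to Spark as
# spark.ui.port. Note there is a small race window between releasing the socket
# here and spark-submit actually binding the port. The function name is
# hypothetical.
def _find_free_port_example() -> None:
    port = _find_free_port()
    print(f"Spark UI would be served on http://localhost:{port}")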
class StandaloneClusterJobMixin:
    def __init__(
        self, job_id: str, job_name: str, process: subprocess.Popen, ui_port: int = None
    ):
        self._job_id = job_id
        self._job_name = job_name
        self._process = process
        self._ui_port = ui_port
        self._start_time = datetime.utcnow()

    def get_id(self) -> str:
        return self._job_id

    def check_if_started(self):
        if not self._ui_port:
            return True

        try:
            applications = requests.get(
                f"http://localhost:{self._ui_port}/api/v1/applications"
            ).json()
        except RequestException:
            return False

        app = next(
            iter(app for app in applications if app["name"] == self._job_name), None
        )
        if not app:
            return False

        return True

    def get_start_time(self) -> datetime:
        return self._start_time

    def get_status(self) -> SparkJobStatus:
        code = self._process.poll()
        if code is None:
            if not self.check_if_started():
                return SparkJobStatus.STARTING
            return SparkJobStatus.IN_PROGRESS

        if code != 0:
            return SparkJobStatus.FAILED

        return SparkJobStatus.COMPLETED

    def cancel(self):
        self._process.terminate()
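# Illustrative polling sketch, not part of the upstream module: get_status()
# combines Popen.poll() with a probe of the Spark UI REST API, so callers can
# wait for a terminal state with a simple loop. The helper name and the poll
# interval are assumptions made for this example.
def _wait_for_terminal_state(
    job: StandaloneClusterJobMixin, poll_interval_sec: float = 5.0
) -> SparkJobStatus:
    import time

    while True:
        status = job.get_status()
        if status in (SparkJobStatus.FAILED, SparkJobStatus.COMPLETED):
            return status
        time.sleep(poll_interval_sec)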
class StandaloneClusterBatchIngestionJob(StandaloneClusterJobMixin, BatchIngestionJob):
    """
    Batch ingestion job result for a standalone Spark cluster
    """

    def __init__(
        self,
        job_id: str,
        job_name: str,
        process: subprocess.Popen,
        ui_port: int,
        feature_table: str,
    ) -> None:
        super().__init__(job_id, job_name, process, ui_port)
        self._feature_table = feature_table

    def get_feature_table(self) -> str:
        return self._feature_table


class StandaloneClusterStreamingIngestionJob(
    StandaloneClusterJobMixin, StreamIngestionJob
):
    """
    Streaming ingestion job result for a standalone Spark cluster
    """

    def __init__(
        self,
        job_id: str,
        job_name: str,
        process: subprocess.Popen,
        ui_port: int,
        job_hash: str,
        feature_table: str,
    ) -> None:
        super().__init__(job_id, job_name, process, ui_port)
        self._job_hash = job_hash
        self._feature_table = feature_table

    def get_hash(self) -> str:
        return self._job_hash

    def get_feature_table(self) -> str:
        return self._feature_table
class StandaloneClusterRetrievalJob(StandaloneClusterJobMixin, RetrievalJob):
    """
    Historical feature retrieval job result for a standalone Spark cluster
    """

    def __init__(
        self,
        job_id: str,
        job_name: str,
        process: subprocess.Popen,
        output_file_uri: str,
    ):
        """
        This is the historical feature retrieval job result returned by
        StandaloneClusterLauncher.

        Args:
            job_id (str): Historical feature retrieval job ID.
            job_name (str): Name of the Spark job.
            process (subprocess.Popen): PySpark driver process spawned by the launcher.
            output_file_uri (str): URI of the historical feature retrieval job's output file.
        """
        super().__init__(job_id, job_name, process)
        self._output_file_uri = output_file_uri

    def get_output_file_uri(self, timeout_sec: int = None, block=True):
        if not block:
            return self._output_file_uri

        with self._process as p:
            try:
                p.wait(timeout_sec)
                return self._output_file_uri
            except Exception:
                p.kill()
                raise SparkJobFailure("Timeout waiting for subprocess to return")
        if self._process.returncode != 0:
            stderr = "" if self._process.stderr is None else self._process.stderr.read()
            stdout = "" if self._process.stdout is None else self._process.stdout.read()

            raise SparkJobFailure(
                f"Non zero return code: {self._process.returncode}. stderr: {stderr} stdout: {stdout}"
            )
        return self._output_file_uri
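# Illustrative usage sketch, not part of the upstream module: with block=True
# (the default) the call waits for the driver subprocess to exit and raises
# SparkJobFailure on timeout or a non-zero exit code; with block=False it
# returns the configured output URI immediately. The function and variable
# names are hypothetical.
def _retrieval_output_example(retrieval_job: StandaloneClusterRetrievalJob) -> None:
    try:
        output_uri = retrieval_job.get_output_file_uri(timeout_sec=600)
        print(f"Features written to {output_uri}")
    except SparkJobFailure as exc:
        print(f"Retrieval failed: {exc}")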
class StandaloneClusterLauncher(JobLauncher):
    """
    Submits jobs to a standalone Spark cluster in client mode.
    """

    def __init__(self, master_url: str, spark_home: str = None):
        """
        This launcher executes the spark-submit script in a subprocess. The
        subprocess will run until the PySpark driver exits.

        Args:
            master_url (str): Spark cluster URL. Must start with spark://.
            spark_home (str): Local file path to the Spark installation directory.
                If not provided, the environment variable `SPARK_HOME` is used
                instead.
        """
        self.master_url = master_url
        self.spark_home = spark_home if spark_home else os.getenv("SPARK_HOME")

    @property
    def spark_submit_script_path(self):
        return os.path.join(self.spark_home, "bin/spark-submit")

    def spark_submit(
        self, job_params: SparkJobParameters, ui_port: int = None
    ) -> subprocess.Popen:
        submission_cmd = [
            self.spark_submit_script_path,
            "--master",
            self.master_url,
            "--name",
            job_params.get_name(),
        ]

        if job_params.get_class_name():
            submission_cmd.extend(["--class", job_params.get_class_name()])

        if ui_port:
            submission_cmd.extend(["--conf", f"spark.ui.port={ui_port}"])

        # Workaround for https://github.com/apache/spark/pull/26552
        # Fixes running Spark jobs with the BigQuery connector (w/ shadowing) on JDK 9+
        submission_cmd.extend(
            [
                "--conf",
                "spark.executor.extraJavaOptions="
                "-Dcom.google.cloud.spark.bigquery.repackaged.io.netty.tryReflectionSetAccessible=true -Duser.timezone=GMT",
                "--conf",
                "spark.driver.extraJavaOptions="
                "-Dcom.google.cloud.spark.bigquery.repackaged.io.netty.tryReflectionSetAccessible=true -Duser.timezone=GMT",
                "--conf",
                "spark.sql.session.timeZone=UTC",  # ignore local timezone
                "--packages",
                ",".join([BQ_SPARK_PACKAGE] + job_params.get_extra_packages()),
                "--jars",
                "https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop2-latest.jar,"
                "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/2.7.3/hadoop-aws-2.7.3.jar,"
                "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk/1.7.4/aws-java-sdk-1.7.4.jar",
                "--conf",
                "spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem",
                "--conf",
                "spark.hadoop.fs.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
            ]
        )

        submission_cmd.append(job_params.get_main_file_path())
        submission_cmd.extend(job_params.get_arguments())

        return subprocess.Popen(submission_cmd)
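    # Illustrative sketch, not part of the upstream module: for a PySpark job the
    # command assembled above has roughly this shape (placeholders stand in for
    # values taken from the job parameters):
    #
    #   $SPARK_HOME/bin/spark-submit \
    #       --master spark://<host>:<port> \
    #       --name <job name> \
    #       --conf spark.ui.port=<free port> \
    #       --conf spark.executor.extraJavaOptions=... \
    #       --conf spark.driver.extraJavaOptions=... \
    #       --conf spark.sql.session.timeZone=UTC \
    #       --packages <BQ_SPARK_PACKAGE>,<extra packages> \
    #       --jars <gcs-connector>,<hadoop-aws>,<aws-java-sdk> \
    #       --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \
    #       --conf spark.hadoop.fs.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem \
    #       <main file path> <job arguments ...>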
    def historical_feature_retrieval(
        self, job_params: RetrievalJobParameters
    ) -> RetrievalJob:
        job_id = str(uuid.uuid4())
        job = StandaloneClusterRetrievalJob(
            job_id,
            job_params.get_name(),
            self.spark_submit(job_params),
            job_params.get_destination_path(),
        )
        global_job_cache.add_job(job)
        return job

    def offline_to_online_ingestion(
        self, ingestion_job_params: BatchIngestionJobParameters
    ) -> BatchIngestionJob:
        job_id = str(uuid.uuid4())
        ui_port = _find_free_port()
        job = StandaloneClusterBatchIngestionJob(
            job_id,
            ingestion_job_params.get_name(),
            self.spark_submit(ingestion_job_params, ui_port),
            ui_port,
            ingestion_job_params.get_feature_table_name(),
        )
        global_job_cache.add_job(job)
        return job

    def schedule_offline_to_online_ingestion(
        self, ingestion_job_params: ScheduledBatchIngestionJobParameters
    ):
        raise NotImplementedError(
            "Scheduled Spark jobs are not supported by the standalone launcher"
        )

    def unschedule_offline_to_online_ingestion(self, project: str, feature_table: str):
        raise NotImplementedError(
            "Unscheduling Spark jobs is not supported by the standalone launcher"
        )

    def start_stream_to_online_ingestion(
        self, ingestion_job_params: StreamIngestionJobParameters
    ) -> StreamIngestionJob:
        job_id = str(uuid.uuid4())
        ui_port = _find_free_port()
        job = StandaloneClusterStreamingIngestionJob(
            job_id,
            ingestion_job_params.get_name(),
            self.spark_submit(ingestion_job_params, ui_port),
            ui_port,
            ingestion_job_params.get_job_hash(),
            ingestion_job_params.get_feature_table_name(),
        )
        global_job_cache.add_job(job)
        return job

    def get_job_by_id(self, job_id: str) -> SparkJob:
        return global_job_cache.get_job_by_id(job_id)

    def list_jobs(
        self,
        include_terminated: bool,
        project: Optional[str],
        table_name: Optional[str],
    ) -> List[SparkJob]:
        if include_terminated is True:
            return global_job_cache.list_jobs()
        else:
            return [
                job
                for job in global_job_cache.list_jobs()
                if job.get_status()
                in (SparkJobStatus.STARTING, SparkJobStatus.IN_PROGRESS)
            ]
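# Illustrative end-to-end sketch, not part of the upstream module: submit a
# historical retrieval job against a standalone master and block until the
# output is written. `job_params` is assumed to be an already-constructed
# RetrievalJobParameters (its constructor lives in feast_spark.pyspark.abc and
# is not shown here); the master URL is a placeholder and the function name is
# hypothetical.
def _launcher_usage_example(job_params: RetrievalJobParameters) -> None:
    launcher = StandaloneClusterLauncher(
        master_url="spark://localhost:7077", spark_home=os.getenv("SPARK_HOME")
    )
    job = launcher.historical_feature_retrieval(job_params)

    # The job is also registered in the global cache, so it can be looked up by ID.
    assert launcher.get_job_by_id(job.get_id()) is job

    print(job.get_output_file_uri(timeout_sec=600))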