Commit 8161b4b

42 support spark resource configuration in notebook (#318)
* Update SparkSession configuration and add API request in startup.py
* Add logging to create_spark_dev function in startup.py
* Refactor create_spark_dev function in startup.py
* Refactor create_spark_dev function and add logging in startup.py
* Refactor create_spark_dev function and add logging in startup.py
* Refactor create_spark_dev function and update API request in startup.py
* Refactor create_spark_dev function and handle exception in startup.py
* Refactor create_spark_dev function and update API request in startup.py
* Refactor create_spark_dev function and update API request in startup.py
* Refactor create_spark_dev function and update API request in startup.py
* Refactor create_spark_dev function and update API request in startup.py
* Refactor create_spark_dev function and update API request in startup.py
* Refactor create_spark_dev function and handle exception in startup.py
* Refactor create_spark_dev function and update API request in startup.py
* Update config variable name in startup.py
1 parent 3c09c09 commit 8161b4b
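
At a high level, this commit makes the notebook fetch executor resource settings from the server's new GET /spark_app/config endpoint and feed them into the SparkSession builder. A minimal sketch of that flow, assuming the config service is reachable at http://server:5002 as in the diffs below (the values shown are the server-side defaults):

import requests
from pyspark.sql import SparkSession

# Ask the server for the executor resource configuration.
config_json = requests.get("http://server:5002/spark_app/config").json()
# e.g. {'executor.memory': '512m', 'executor.cores': '1', 'spark.executor.instances': '1'}

# Apply the returned values when building the session, as startup.py now does
# (the first two keys are passed through verbatim, without a "spark." prefix).
spark = (
    SparkSession.builder
    .appName("PySpark Example")
    .master("spark://spark-master:7077")
    .config("executor.memory", config_json["executor.memory"])
    .config("executor.cores", config_json["executor.cores"])
    .config("spark.executor.instances", config_json["spark.executor.instances"])
    .getOrCreate()
)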

File tree: 6 files changed, +105 -77 lines changed


docker/notebook/startup.py

Lines changed: 68 additions & 52 deletions
@@ -5,9 +5,14 @@
 from IPython import get_ipython
 from IPython.display import *
 from kubernetes import client, config
+import requests
+import logging
 
 environment = os.getenv('ENVIRONMENT', 'development')  # Default to 'development' if not set
 
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
 # Set the environment variables
 def set_env():
     # kubernetes_host = os.environ.get('KUBERNETES_SERVICE_HOST')
@@ -27,56 +32,57 @@ def set_env():
 
 # Create a Spark session
 # def create_spark(app_name, master_url):
-spark = SparkSession.builder \
-    .appName(app_name) \
-    .master(kubernetes_url) \
-    .config("spark.submit.deployMode", "client") \
-    .config("spark.driver.host", driver_host) \
-    .config("spark.driver.cores", "1") \
-    .config("spark.driver.memory", "1g") \
-    .config("spark.executor.instances", "1") \
-    .config("spark.executor.cores", "1") \
-    .config("spark.executor.memory", "1g") \
-    .config("spark.kubernetes.namespace", namespace) \
-    .config("spark.kubernetes.container.image", executor_image) \
-    .config("spark.kubernetes.authenticate.driver.serviceAccountName", service_account) \
-    .config("spark.kubernetes.authenticate.executor.serviceAccountName", service_account) \
-    .config("spark.eventLog.enabled", "true") \
-    .config("spark.eventLog.dir", f"gs://{bucket_name}/event-logs/") \
-    .config("spark.history.fs.logDirectory", f"gs://{bucket_name}/event-logs/") \
-    .config("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem") \
-    .config("spark.hadoop.fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS") \
-    .config("spark.hadoop.fs.gs.auth.service.account.enable", "true") \
-    .getOrCreate()
+# spark = SparkSession.builder \
+#     .appName(app_name) \
+#     .master(kubernetes_url) \
+#     .config("spark.submit.deployMode", "client") \
+#     .config("spark.driver.host", driver_host) \
+#     .config("spark.driver.cores", "1") \
+#     .config("spark.driver.memory", "1g") \
+#     .config("spark.executor.instances", "1") \
+#     .config("spark.executor.cores", "1") \
+#     .config("spark.executor.memory", "1g") \
+#     .config("spark.kubernetes.namespace", namespace) \
+#     .config("spark.kubernetes.container.image", executor_image) \
+#     .config("spark.kubernetes.authenticate.driver.serviceAccountName", service_account) \
+#     .config("spark.kubernetes.authenticate.executor.serviceAccountName", service_account) \
+#     .config("spark.eventLog.enabled", "true") \
+#     .config("spark.eventLog.dir", f"gs://{bucket_name}/event-logs/") \
+#     .config("spark.history.fs.logDirectory", f"gs://{bucket_name}/event-logs/") \
+#     .config("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem") \
+#     .config("spark.hadoop.fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS") \
+#     .config("spark.hadoop.fs.gs.auth.service.account.enable", "true") \
+#     .getOrCreate()
 
-return spark
+# return spark
 
 # def start():
-# Configuring the API client
-config.load_incluster_config()
+# # Configuring the API client
+# config.load_incluster_config()
 
-# Creating an API instance to interact with the K8s service
-v1 = client.CoreV1Api()
+# # Creating an API instance to interact with the K8s service
+# v1 = client.CoreV1Api()
 
-# Fetching the service details
-service_name = os.environ.get("WEBUI_SERVICE_NAME", "notebook-spark-ui")
-service = v1.read_namespaced_service(service_name, namespace)
+# # Fetching the service details
+# service_name = os.environ.get("WEBUI_SERVICE_NAME", "notebook-spark-ui")
+# service = v1.read_namespaced_service(service_name, namespace)
 
-webui_host = service.status.load_balancer.ingress[0].ip
-webui_port = spark.sparkContext.uiWebUrl.split(":")[-1]
-webui_url = f"http://{webui_host}:{webui_port}"
+# webui_host = service.status.load_balancer.ingress[0].ip
+# webui_port = spark.sparkContext.uiWebUrl.split(":")[-1]
+# webui_url = f"http://{webui_host}:{webui_port}"
 
-msg = f"**App name**: {app_name}\n\n" + \
-    f"**Master**: {kubernetes_url}\n\n" + \
-    f"**Driver host**: {driver_host}\n\n" + \
-    f"**Spark UI**: {webui_url}"
+# msg = f"**App name**: {app_name}\n\n" + \
+#     f"**Master**: {kubernetes_url}\n\n" + \
+#     f"**Driver host**: {driver_host}\n\n" + \
+#     f"**Spark UI**: {webui_url}"
 
-display(Markdown(msg))
+# display(Markdown(msg))
 
 class PawMarkSparkSession:
 
-    def __init__(self, spark_session):
+    def __init__(self, config_json, spark_session):
         self._spark_session = spark_session
+        self._config_json = config_json
         self.history_server_base_url = "http://localhost:18080"
 
     def __getattr__(self, name):
@@ -94,26 +100,36 @@ def _repr_html_(self):
         return f"""
         <div style="border: 1px solid #e8e8e8; padding: 10px;">
             <h3>Spark Session Information</h3>
+            <p><strong>Config:</strong> {self._config_json}</p>
             <p><strong>Application ID:</strong> {application_id}</p>
             <p><strong>Spark UI:</strong> <a href="{spark_ui_link}">{spark_ui_link}</a></p>
         </div>
         """
 
 def create_spark_dev():
-    spark = PawMarkSparkSession(SparkSession.builder \
-        .appName("PySpark Example") \
-        .master("spark://spark-master:7077") \
-        .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.0.0") \
-        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
-        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
-        .config("spark.eventLog.enabled", "true") \
-        .config("spark.eventLog.dir", "/opt/data/spark-events") \
-        .config("spark.history.fs.logDirectory", "/opt/data/spark-events") \
-        .config("spark.sql.warehouse.dir", "/opt/data/spark-warehouse") \
-        .config("executor.memory", "1g") \
-        .config("executor.cores", "1") \
-        .config("spark.executor.instances", "1") \
-        .getOrCreate())
+    logger.info("Creating Spark session")
+    try:
+        config_json = requests.get("http://server:5002/spark_app/config").json()
+    except Exception as e:
+        config_json = 'Error loading config: ' + str(e)
+
+    spark = PawMarkSparkSession(
+        config_json,
+        SparkSession.builder \
+            .appName("PySpark Example") \
+            .master("spark://spark-master:7077") \
+            .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.0.0") \
+            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
+            .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
+            .config("spark.eventLog.enabled", "true") \
+            .config("spark.eventLog.dir", "/opt/data/spark-events") \
+            .config("spark.history.fs.logDirectory", "/opt/data/spark-events") \
+            .config("spark.sql.warehouse.dir", "/opt/data/spark-warehouse") \
+            .config("executor.memory", config_json['executor.memory']) \
+            .config("executor.cores", config_json['executor.cores']) \
+            .config("spark.executor.instances", config_json['spark.executor.instances']) \
+            .getOrCreate()
+    )
 
     return spark

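One caveat in the new create_spark_dev: when the config request fails, config_json is set to an error string, so the later config_json['executor.memory'] lookups would raise a TypeError instead of starting a session. A minimal sketch of a more defensive variant (not part of this commit; the fallback values simply mirror the server-side defaults):

import logging
import requests

logger = logging.getLogger(__name__)

# Fallback values mirroring server/app/services/spark_app.py, used when the
# config service cannot be reached.
DEFAULT_SPARK_CONFIG = {
    'executor.memory': '512m',
    'executor.cores': '1',
    'spark.executor.instances': '1',
}

def load_spark_config(url="http://server:5002/spark_app/config"):
    """Fetch executor resource settings, falling back to safe defaults."""
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()
        return response.json()
    except Exception as e:
        logger.warning("Error loading config, using defaults: %s", e)
        return dict(DEFAULT_SPARK_CONFIG)
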
examples/user_0@gmail.com/quickstart.ipynb

Lines changed: 18 additions & 20 deletions
@@ -2,6 +2,7 @@
 "cells": [
 {
 "cell_type": "markdown",
+"isExecuted": true,
 "lastExecutionResult": null,
 "lastExecutionTime": null,
 "metadata": {},
@@ -21,6 +22,7 @@
 },
 {
 "cell_type": "markdown",
+"isExecuted": true,
 "lastExecutionResult": null,
 "lastExecutionTime": null,
 "metadata": {},
@@ -31,6 +33,7 @@
 {
 "cell_type": "code",
 "execution_count": null,
+"isExecuted": false,
 "lastExecutionResult": null,
 "lastExecutionTime": null,
 "metadata": {},
@@ -40,28 +43,10 @@
 "import pandas as pd"
 ]
 },
-{
-"cell_type": "code",
-"execution_count": null,
-"lastExecutionResult": null,
-"lastExecutionTime": null,
-"metadata": {},
-"outputs": [
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"666\n"
-]
-}
-],
-"source": [
-"print(666)"
-]
-},
 {
 "cell_type": "code",
 "execution_count": 4,
+"isExecuted": false,
 "lastExecutionResult": null,
 "lastExecutionTime": null,
 "metadata": {},
@@ -93,6 +78,7 @@
 {
 "cell_type": "code",
 "execution_count": 5,
+"isExecuted": false,
 "lastExecutionResult": null,
 "lastExecutionTime": null,
 "metadata": {},
@@ -105,6 +91,7 @@
 {
 "cell_type": "code",
 "execution_count": null,
+"isExecuted": false,
 "lastExecutionResult": null,
 "lastExecutionTime": null,
 "metadata": {},
@@ -129,6 +116,7 @@
 {
 "cell_type": "code",
 "execution_count": null,
+"isExecuted": false,
 "lastExecutionResult": null,
 "lastExecutionTime": null,
 "metadata": {},
@@ -153,6 +141,15 @@
 "source": [
 "spark.sql(\"select * from test\").show()"
 ]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"spark.stop()"
+]
 }
 ],
 "metadata": {
@@ -172,7 +169,8 @@
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
 "version": "3.11.6"
-}
+},
+"uuid": "f193bc0d-d108-4ceb-babc-9fc341dfa83d"
 },
 "nbformat": 4,
 "nbformat_minor": 4

server/app/routes/spark_app.py

Lines changed: 5 additions & 1 deletion
@@ -10,4 +10,8 @@
 def create_spark_app(spark_app_id):
     data = request.get_json()
     notebook_path = data.get('notebookPath', None)
-    return SparkApp.create_spark_app(spark_app_id=spark_app_id, notebook_path=notebook_path)
+    return SparkApp.create_spark_app(spark_app_id=spark_app_id, notebook_path=notebook_path)
+
+@spark_app_blueprint.route('/spark_app/config', methods=['GET'])
+def get_spark_app_config():
+    return SparkApp.get_spark_app_config()

server/app/services/spark_app.py

Lines changed: 13 additions & 0 deletions
@@ -40,6 +40,19 @@ def get_spark_app_by_id(spark_app_id: str = None):
             status=200
         )
 
+    @staticmethod
+    def get_spark_app_config():
+        spark_app_config = {
+            'executor.memory': '512m',
+            'executor.cores': '1',
+            'spark.executor.instances': '1',
+        }
+
+        return Response(
+            response=json.dumps(spark_app_config),
+            status=200
+        )
+
     @staticmethod
     def create_spark_app(spark_app_id: str = None, notebook_path: str = None):
         logger.info(f"Creating spark app with id: {spark_app_id} for notebook path: {notebook_path}")

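To exercise the new endpoint end to end, a request like the following should return the static defaults defined above (the host and port are assumptions based on the notebook's startup.py; adjust for your deployment):

import requests

# Fetch the resource defaults exposed by GET /spark_app/config.
resp = requests.get("http://server:5002/spark_app/config", timeout=5)
resp.raise_for_status()
print(resp.json())
# Expected: {'executor.memory': '512m', 'executor.cores': '1', 'spark.executor.instances': '1'}
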
webapp/src/components/auth/LoginForm.js

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ function LoginForm({ onLogin }) {
     } else {
       // The login failed
       // You might want to show an error message here
-      console.error('Failed to log in');
+      console.error('Failed to log in: ', response.text, response.status);
       setError(true);
     }
   };

webapp/src/models/SparkModel.js

Lines changed: 0 additions & 3 deletions
@@ -8,9 +8,6 @@ class SparkModel {
   static isSparkInfo(html) {
     const parser = new DOMParser();
     const doc = parser.parseFromString(html, 'text/html');
-    console.log('html:', html);
-    console.log('doc:', doc);
-
     // Check if the HTML includes Spark info
     const sparkInfo = doc.querySelector('h3');
     console.log('sparkInfo:', sparkInfo);
