{
"cells": [
{
"cell_type": "markdown",
"id": "e8e13cd9",
"metadata": {},
"source": [
"<img src=\"https://docs.lakefs.io/assets/logo.svg\" alt=\"lakeFS logo\" width=300/>"
]
},
{
"cell_type": "markdown",
"id": "89718426",
"metadata": {},
"source": [
"# Setup"
]
},
{
"cell_type": "markdown",
"id": "f824b929",
"metadata": {},
"source": [
"## Set up the connection to lakeFS"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "b2e44584",
"metadata": {},
"outputs": [],
"source": [
"import lakefs_client\n",
"from lakefs_client.client import LakeFSClient\n",
"\n",
"# Demo credentials (the AWS documentation example keys) and the\n",
"# in-Docker hostname of the lakeFS server.\n",
"configuration = lakefs_client.Configuration()\n",
"configuration.username = 'AKIAIOSFODNN7EXAMPLE'\n",
"configuration.password = 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'\n",
"configuration.host = 'http://lakefs:8000'\n",
"\n",
"lakefs = LakeFSClient(configuration)"
]
},
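{
"cell_type": "markdown",
"id": "a3c51f02",
"metadata": {},
"source": [
"### Optional: check that the client can reach lakeFS\n",
"\n",
"A quick sanity check before going any further. This is a sketch that assumes the legacy `lakefs_client` package exposes the server's `/healthcheck` endpoint as `lakefs.healthcheck.health_check()`; it returns nothing on success and raises an exception if the server is unreachable."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a3c51f03",
"metadata": {},
"outputs": [],
"source": [
"# Raises lakefs_client.ApiException (or a connection error) on failure.\n",
"lakefs.healthcheck.health_check()\n",
"print('lakeFS is reachable')"
]
},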
{
"cell_type": "markdown",
"id": "0047139f",
"metadata": {},
"source": [
"### Get the first repository present in lakeFS"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b6bbdb80",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using lakeFS repository 'example' with storage namespace s3://example\n"
]
}
],
"source": [
"repo = lakefs.repositories.list_repositories().results[0]\n",
"print(f\"Using lakeFS repository '{repo.id}' with storage namespace {repo.storage_namespace}\")"
]
},
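{
"cell_type": "markdown",
"id": "b7d42e10",
"metadata": {},
"source": [
"If no repository exists yet, the cell above fails with an `IndexError`. A hedged sketch for creating one first, assuming the legacy client's `create_repository` call; the repository name and storage namespace here are illustrative, not part of the original notebook:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b7d42e11",
"metadata": {},
"outputs": [],
"source": [
"from lakefs_client import models\n",
"\n",
"if not lakefs.repositories.list_repositories().results:\n",
"    # Hypothetical name/namespace -- use values valid for your installation.\n",
"    lakefs.repositories.create_repository(\n",
"        models.RepositoryCreation(name='example',\n",
"                                  storage_namespace='s3://example',\n",
"                                  default_branch='main'))"
]
},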
{
"cell_type": "markdown",
"id": "42ab2add",
"metadata": {},
"source": [
"### Define the warehouse directory based on the provided namespace"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c2616c44",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using s3a://example/main/ for Spark SQL warehouse\n"
]
}
],
"source": [
"# Swap the s3:// scheme for s3a:// (the Hadoop S3A filesystem) and point\n",
"# at the main branch. Replace only the leading scheme so that an 's3'\n",
"# appearing elsewhere in the namespace is left alone.\n",
"warehouse_dir = repo.storage_namespace.replace('s3://', 's3a://', 1) + '/main/'\n",
"print(f\"Using {warehouse_dir} for Spark SQL warehouse\")"
]
},
{
"cell_type": "markdown",
"id": "4e4e906a",
"metadata": {},
"source": [
"## Set up Spark\n",
"\n",
"Use lakeFS as the storage for Spark SQL tables."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "36742058",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"            <div>\n",
"                <p><b>SparkSession - in-memory</b></p>\n",
"                \n",
"        <div>\n",
"            <p><b>SparkContext</b></p>\n",
"\n",
"            <p><a href=\"http://b4e4d2eb0014:4040\">Spark UI</a></p>\n",
"\n",
"            <dl>\n",
"              <dt>Version</dt>\n",
"                <dd><code>v3.4.0</code></dd>\n",
"              <dt>Master</dt>\n",
"                <dd><code>local[*]</code></dd>\n",
"              <dt>AppName</dt>\n",
"                <dd><code>lakeFS / Jupyter</code></dd>\n",
"            </dl>\n",
"        </div>\n",
"        \n",
"            </div>\n",
"        "
],
"text/plain": [
"<pyspark.sql.session.SparkSession at 0xffff8d7710f0>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from pyspark.sql import SparkSession\n",
"\n",
"# Route S3A traffic to lakeFS's S3 gateway, authenticating with the same\n",
"# credentials as the lakeFS API client above, and put the Spark SQL\n",
"# warehouse on the repository's main branch.\n",
"spark = SparkSession.builder.appName(\"lakeFS / Jupyter\") \\\n",
"    .config(\"spark.hadoop.fs.s3.impl\", \"org.apache.hadoop.fs.s3a.S3AFileSystem\") \\\n",
"    .config(\"spark.hadoop.fs.s3a.endpoint\", \"http://lakefs:8000\") \\\n",
"    .config(\"spark.hadoop.fs.s3a.path.style.access\", \"true\") \\\n",
"    .config(\"spark.hadoop.fs.s3a.access.key\", \"AKIAIOSFODNN7EXAMPLE\") \\\n",
"    .config(\"spark.hadoop.fs.s3a.secret.key\", \"wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\") \\\n",
"    .config(\"spark.hadoop.fs.s3a.aws.credentials.provider\", \"org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider\") \\\n",
"    .config(\"spark.sql.warehouse.dir\", warehouse_dir) \\\n",
"    .getOrCreate()\n",
"spark.sparkContext.setLogLevel(\"INFO\")\n",
"\n",
"spark"
]
},
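{
"cell_type": "markdown",
"id": "c9e83a20",
"metadata": {},
"source": [
"The `%%sql` cell magic used below is not built into IPython; in this environment it comes from a startup script (`00-prettytables.py`, visible in the traceback further down). If you are reproducing this notebook elsewhere, a minimal stand-in could look like the sketch below (the original magic also pretty-prints results):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c9e83a21",
"metadata": {},
"outputs": [],
"source": [
"from IPython.core.magic import register_cell_magic\n",
"\n",
"@register_cell_magic\n",
"def sql(line, cell):\n",
"    # Run the cell body as Spark SQL and return the resulting DataFrame.\n",
"    return spark.sql(cell)"
]
},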
{
"cell_type": "markdown",
"id": "1fa18ec8",
"metadata": {},
"source": [
"## Create database without having written anything to the repo first"
]
},
{
"cell_type": "markdown",
"id": "0b056ce9",
"metadata": {},
"source": [
"This will fail: Spark has to create the warehouse directory `s3a://example/main/` for its catalog, and S3A implements `mkdirs` as a PUT of a zero-byte directory-marker object. With nothing yet written to the repo, that PUT comes back from lakeFS as a 404 (see `Caused by: java.io.FileNotFoundException: PUT 0-byte object on main/` in the traceback below). Once an object exists under `main/`, S3A sees the directory as already present and the same statement succeeds."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ab90e2e0",
"metadata": {},
"outputs": [
{
"ename": "Py4JJavaError",
"evalue": "An error occurred while calling o38.sql.\n: org.apache.spark.SparkException: Unable to create database default as failed to create its directory s3a://example/main.\n\tat org.apache.spark.sql.errors.QueryExecutionErrors$.unableToCreateDatabaseAsFailedToCreateDirectoryError(QueryExecutionErrors.scala:550)\n\tat org.apache.spark.sql.catalyst.catalog.InMemoryCatalog.liftedTree1$1(InMemoryCatalog.scala:126)\n\tat org.apache.spark.sql.catalyst.catalog.InMemoryCatalog.createDatabase(InMemoryCatalog.scala:119)\n\tat org.apache.spark.sql.internal.SharedState.externalCatalog$lzycompute(SharedState.scala:159)\n\tat org.apache.spark.sql.internal.SharedState.externalCatalog(SharedState.scala:140)\n\tat org.apache.spark.sql.internal.BaseSessionStateBuilder.$anonfun$catalog$1(BaseSessionStateBuilder.scala:154)\n\tat org.apache.spark.sql.catalyst.catalog.SessionCatalog.externalCatalog$lzycompute(SessionCatalog.scala:122)\n\tat org.apache.spark.sql.catalyst.catalog.SessionCatalog.externalCatalog(SessionCatalog.scala:122)\n\tat org.apache.spark.sql.catalyst.catalog.SessionCatalog.databaseExists(SessionCatalog.scala:319)\n\tat org.apache.spark.sql.execution.datasources.v2.V2SessionCatalog.namespaceExists(V2SessionCatalog.scala:241)\n\tat org.apache.spark.sql.execution.datasources.v2.CreateNamespaceExec.run(CreateNamespaceExec.scala:43)\n\tat org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:43)\n\tat org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:43)\n\tat org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:49)\n\tat org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)\n\tat org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)\n\tat org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)\n\tat org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)\n\tat org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)\n\tat org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:512)\n\tat org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:104)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:512)\n\tat org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:31)\n\tat org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)\n\tat org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)\n\tat org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:31)\n\tat org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:31)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:488)\n\tat org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)\n\tat org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)\n\tat org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)\n\tat org.apache.spark.sql.Dataset.<init>(Dataset.scala:219)\n\tat org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:99)\n\tat org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)\n\tat org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:96)\n\tat org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:640)\n\tat org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)\n\tat org.apache.spark.sql.SparkSession.sql(SparkSession.scala:630)\n\tat org.apache.spark.sql.SparkSession.sql(SparkSession.scala:662)\n\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)\n\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.base/java.lang.reflect.Method.invoke(Method.java:568)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)\n\tat py4j.ClientServerConnection.run(ClientServerConnection.java:106)\n\tat java.base/java.lang.Thread.run(Thread.java:833)\nCaused by: java.io.FileNotFoundException: PUT 0-byte object on main/: com.amazonaws.services.s3.model.AmazonS3Exception: Not Found (Service: Amazon S3; Status Code: 404; Error Code: 404 Not Found; Request ID: null; S3 Extended Request ID: null; Proxy: null), S3 Extended Request ID: null:404 Not Found\n\tat org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:260)\n\tat org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:117)\n\tat org.apache.hadoop.fs.s3a.Invoker.lambda$retry$4(Invoker.java:320)\n\tat org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:412)\n\tat org.apache.hadoop.fs.s3a.Invoker.retry(Invoker.java:316)\n\tat org.apache.hadoop.fs.s3a.Invoker.retry(Invoker.java:291)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.createEmptyObject(S3AFileSystem.java:4096)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.createFakeDirectory(S3AFileSystem.java:4071)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.innerMkdirs(S3AFileSystem.java:3038)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.mkdirs(S3AFileSystem.java:2979)\n\tat org.apache.hadoop.fs.FileSystem.mkdirs(FileSystem.java:2388)\n\tat org.apache.spark.sql.catalyst.catalog.InMemoryCatalog.liftedTree1$1(InMemoryCatalog.scala:122)\n\t... 52 more\nCaused by: com.amazonaws.services.s3.model.AmazonS3Exception: Not Found (Service: Amazon S3; Status Code: 404; Error Code: 404 Not Found; Request ID: null; S3 Extended Request ID: null; Proxy: null), S3 Extended Request ID: null\n\tat com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleErrorResponse(AmazonHttpClient.java:1828)\n\tat com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleServiceErrorResponse(AmazonHttpClient.java:1412)\n\tat com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1374)\n\tat com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1145)\n\tat com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:802)\n\tat com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:770)\n\tat com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:744)\n\tat com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:704)\n\tat com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:686)\n\tat com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:550)\n\tat com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:530)\n\tat com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5227)\n\tat com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5173)\n\tat com.amazonaws.services.s3.AmazonS3Client.access$300(AmazonS3Client.java:415)\n\tat com.amazonaws.services.s3.AmazonS3Client$PutObjectStrategy.invokeServiceCall(AmazonS3Client.java:6289)\n\tat com.amazonaws.services.s3.AmazonS3Client.uploadObject(AmazonS3Client.java:1834)\n\tat com.amazonaws.services.s3.AmazonS3Client.putObject(AmazonS3Client.java:1794)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.putObjectDirect(S3AFileSystem.java:2432)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$createEmptyObject$22(S3AFileSystem.java:4098)\n\tat org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:115)\n\t... 62 more\n",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mPy4JJavaError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[5], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mget_ipython\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun_cell_magic\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msql\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[38;5;124;43mCREATE DATABASE IF NOT EXISTS nyc\u001b[39;49m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py:2475\u001b[0m, in \u001b[0;36mInteractiveShell.run_cell_magic\u001b[0;34m(self, magic_name, line, cell)\u001b[0m\n\u001b[1;32m 2473\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbuiltin_trap:\n\u001b[1;32m 2474\u001b[0m args \u001b[38;5;241m=\u001b[39m (magic_arg_s, cell)\n\u001b[0;32m-> 2475\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2477\u001b[0m \u001b[38;5;66;03m# The code below prevents the output from being displayed\u001b[39;00m\n\u001b[1;32m 2478\u001b[0m \u001b[38;5;66;03m# when using magics with decodator @output_can_be_silenced\u001b[39;00m\n\u001b[1;32m 2479\u001b[0m \u001b[38;5;66;03m# when the last Python token in the expression is a ';'.\u001b[39;00m\n\u001b[1;32m 2480\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(fn, magic\u001b[38;5;241m.\u001b[39mMAGIC_OUTPUT_CAN_BE_SILENCED, \u001b[38;5;28;01mFalse\u001b[39;00m):\n",
"File \u001b[0;32m~/.ipython/profile_default/startup/00-prettytables.py:81\u001b[0m, in \u001b[0;36msql\u001b[0;34m(line, cell)\u001b[0m\n\u001b[1;32m 79\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _to_table(df, num_rows\u001b[38;5;241m=\u001b[39margs\u001b[38;5;241m.\u001b[39mlimit)\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m---> 81\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _to_table(\u001b[43mspark\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msql\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcell\u001b[49m\u001b[43m)\u001b[49m)\n",
"File \u001b[0;32m/usr/local/spark/python/pyspark/sql/session.py:1440\u001b[0m, in \u001b[0;36mSparkSession.sql\u001b[0;34m(self, sqlQuery, args, **kwargs)\u001b[0m\n\u001b[1;32m 1438\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1439\u001b[0m litArgs \u001b[38;5;241m=\u001b[39m {k: _to_java_column(lit(v)) \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m (args \u001b[38;5;129;01mor\u001b[39;00m {})\u001b[38;5;241m.\u001b[39mitems()}\n\u001b[0;32m-> 1440\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DataFrame(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_jsparkSession\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msql\u001b[49m\u001b[43m(\u001b[49m\u001b[43msqlQuery\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlitArgs\u001b[49m\u001b[43m)\u001b[49m, \u001b[38;5;28mself\u001b[39m)\n\u001b[1;32m 1441\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 1442\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(kwargs) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n",
"File \u001b[0;32m/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py:1322\u001b[0m, in \u001b[0;36mJavaMember.__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1316\u001b[0m command \u001b[38;5;241m=\u001b[39m proto\u001b[38;5;241m.\u001b[39mCALL_COMMAND_NAME \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1317\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcommand_header \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1318\u001b[0m args_command \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1319\u001b[0m proto\u001b[38;5;241m.\u001b[39mEND_COMMAND_PART\n\u001b[1;32m 1321\u001b[0m answer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgateway_client\u001b[38;5;241m.\u001b[39msend_command(command)\n\u001b[0;32m-> 1322\u001b[0m return_value \u001b[38;5;241m=\u001b[39m \u001b[43mget_return_value\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1323\u001b[0m \u001b[43m \u001b[49m\u001b[43manswer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgateway_client\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtarget_id\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1325\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m temp_arg \u001b[38;5;129;01min\u001b[39;00m temp_args:\n\u001b[1;32m 1326\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(temp_arg, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_detach\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n",
"File \u001b[0;32m/usr/local/spark/python/pyspark/errors/exceptions/captured.py:169\u001b[0m, in \u001b[0;36mcapture_sql_exception.<locals>.deco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 167\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdeco\u001b[39m(\u001b[38;5;241m*\u001b[39ma: Any, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkw: Any) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[1;32m 168\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 169\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43ma\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkw\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 170\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m Py4JJavaError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 171\u001b[0m converted \u001b[38;5;241m=\u001b[39m convert_exception(e\u001b[38;5;241m.\u001b[39mjava_exception)\n",
"File \u001b[0;32m/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/protocol.py:326\u001b[0m, in \u001b[0;36mget_return_value\u001b[0;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[1;32m 324\u001b[0m value \u001b[38;5;241m=\u001b[39m OUTPUT_CONVERTER[\u001b[38;5;28mtype\u001b[39m](answer[\u001b[38;5;241m2\u001b[39m:], gateway_client)\n\u001b[1;32m 325\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m answer[\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m==\u001b[39m REFERENCE_TYPE:\n\u001b[0;32m--> 326\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m Py4JJavaError(\n\u001b[1;32m 327\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAn error occurred while calling \u001b[39m\u001b[38;5;132;01m{0}\u001b[39;00m\u001b[38;5;132;01m{1}\u001b[39;00m\u001b[38;5;132;01m{2}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39m\n\u001b[1;32m 328\u001b[0m \u001b[38;5;28mformat\u001b[39m(target_id, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m, name), value)\n\u001b[1;32m 329\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 330\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m Py4JError(\n\u001b[1;32m 331\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAn error occurred while calling \u001b[39m\u001b[38;5;132;01m{0}\u001b[39;00m\u001b[38;5;132;01m{1}\u001b[39;00m\u001b[38;5;132;01m{2}\u001b[39;00m\u001b[38;5;124m. Trace:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{3}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39m\n\u001b[1;32m 332\u001b[0m \u001b[38;5;28mformat\u001b[39m(target_id, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m, name, value))\n",
"\u001b[0;31mPy4JJavaError\u001b[0m: An error occurred while calling o38.sql.\n: org.apache.spark.SparkException: Unable to create database default as failed to create its directory s3a://example/main.\n\tat org.apache.spark.sql.errors.QueryExecutionErrors$.unableToCreateDatabaseAsFailedToCreateDirectoryError(QueryExecutionErrors.scala:550)\n\tat org.apache.spark.sql.catalyst.catalog.InMemoryCatalog.liftedTree1$1(InMemoryCatalog.scala:126)\n\tat org.apache.spark.sql.catalyst.catalog.InMemoryCatalog.createDatabase(InMemoryCatalog.scala:119)\n\tat org.apache.spark.sql.internal.SharedState.externalCatalog$lzycompute(SharedState.scala:159)\n\tat org.apache.spark.sql.internal.SharedState.externalCatalog(SharedState.scala:140)\n\tat org.apache.spark.sql.internal.BaseSessionStateBuilder.$anonfun$catalog$1(BaseSessionStateBuilder.scala:154)\n\tat org.apache.spark.sql.catalyst.catalog.SessionCatalog.externalCatalog$lzycompute(SessionCatalog.scala:122)\n\tat org.apache.spark.sql.catalyst.catalog.SessionCatalog.externalCatalog(SessionCatalog.scala:122)\n\tat org.apache.spark.sql.catalyst.catalog.SessionCatalog.databaseExists(SessionCatalog.scala:319)\n\tat org.apache.spark.sql.execution.datasources.v2.V2SessionCatalog.namespaceExists(V2SessionCatalog.scala:241)\n\tat org.apache.spark.sql.execution.datasources.v2.CreateNamespaceExec.run(CreateNamespaceExec.scala:43)\n\tat org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:43)\n\tat org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:43)\n\tat org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:49)\n\tat org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)\n\tat org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)\n\tat org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)\n\tat org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)\n\tat org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)\n\tat org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:512)\n\tat org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:104)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:512)\n\tat org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:31)\n\tat org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)\n\tat org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)\n\tat org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:31)\n\tat org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:31)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:488)\n\tat org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)\n\tat org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)\n\tat org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)\n\tat org.apache.spark.sql.Dataset.<init>(Dataset.scala:219)\n\tat org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:99)\n\tat org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)\n\tat org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:96)\n\tat org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:640)\n\tat org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)\n\tat org.apache.spark.sql.SparkSession.sql(SparkSession.scala:630)\n\tat org.apache.spark.sql.SparkSession.sql(SparkSession.scala:662)\n\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)\n\tat java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.base/java.lang.reflect.Method.invoke(Method.java:568)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)\n\tat py4j.ClientServerConnection.run(ClientServerConnection.java:106)\n\tat java.base/java.lang.Thread.run(Thread.java:833)\nCaused by: java.io.FileNotFoundException: PUT 0-byte object on main/: com.amazonaws.services.s3.model.AmazonS3Exception: Not Found (Service: Amazon S3; Status Code: 404; Error Code: 404 Not Found; Request ID: null; S3 Extended Request ID: null; Proxy: null), S3 Extended Request ID: null:404 Not Found\n\tat org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:260)\n\tat org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:117)\n\tat org.apache.hadoop.fs.s3a.Invoker.lambda$retry$4(Invoker.java:320)\n\tat org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:412)\n\tat org.apache.hadoop.fs.s3a.Invoker.retry(Invoker.java:316)\n\tat org.apache.hadoop.fs.s3a.Invoker.retry(Invoker.java:291)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.createEmptyObject(S3AFileSystem.java:4096)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.createFakeDirectory(S3AFileSystem.java:4071)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.innerMkdirs(S3AFileSystem.java:3038)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.mkdirs(S3AFileSystem.java:2979)\n\tat org.apache.hadoop.fs.FileSystem.mkdirs(FileSystem.java:2388)\n\tat org.apache.spark.sql.catalyst.catalog.InMemoryCatalog.liftedTree1$1(InMemoryCatalog.scala:122)\n\t... 52 more\nCaused by: com.amazonaws.services.s3.model.AmazonS3Exception: Not Found (Service: Amazon S3; Status Code: 404; Error Code: 404 Not Found; Request ID: null; S3 Extended Request ID: null; Proxy: null), S3 Extended Request ID: null\n\tat com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleErrorResponse(AmazonHttpClient.java:1828)\n\tat com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleServiceErrorResponse(AmazonHttpClient.java:1412)\n\tat com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1374)\n\tat com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1145)\n\tat com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:802)\n\tat com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:770)\n\tat com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:744)\n\tat com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:704)\n\tat com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:686)\n\tat com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:550)\n\tat com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:530)\n\tat com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5227)\n\tat com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5173)\n\tat com.amazonaws.services.s3.AmazonS3Client.access$300(AmazonS3Client.java:415)\n\tat com.amazonaws.services.s3.AmazonS3Client$PutObjectStrategy.invokeServiceCall(AmazonS3Client.java:6289)\n\tat com.amazonaws.services.s3.AmazonS3Client.uploadObject(AmazonS3Client.java:1834)\n\tat com.amazonaws.services.s3.AmazonS3Client.putObject(AmazonS3Client.java:1794)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.putObjectDirect(S3AFileSystem.java:2432)\n\tat org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$createEmptyObject$22(S3AFileSystem.java:4098)\n\tat org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:115)\n\t... 62 more\n"
]
}
],
"source": [
"%%sql\n",
"\n",
"CREATE DATABASE IF NOT EXISTS nyc"
]
},
{
"cell_type": "markdown",
"id": "81c039d4",
"metadata": {},
"source": [
"### Write a dummy file to the lakeFS repo"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "4781dad2",
"metadata": {},
"outputs": [],
"source": [
"spark.range(0, 1).write.save('s3a://example/main/dummy_file')"
]
},
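{
"cell_type": "markdown",
"id": "d1f64b30",
"metadata": {},
"source": [
"Optionally, confirm that the write landed as uncommitted changes on `main`. A sketch assuming the legacy client's `diff_branch` call, which lists a branch's uncommitted changes:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d1f64b31",
"metadata": {},
"outputs": [],
"source": [
"# Each entry should show type 'added' and a path under dummy_file/.\n",
"for change in lakefs.branches.diff_branch(repository=repo.id, branch='main').results:\n",
"    print(change.type, change.path)"
]
},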
{
"cell_type": "markdown",
"id": "a05d0534",
"metadata": {},
"source": [
"## Create the database (this will work)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "d00796d1",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table>\n",
"    <tr>\n",
"    </tr>\n",
"</table>"
],
"text/plain": [
"++\n",
"||\n",
"++\n",
"++"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%sql\n",
"\n",
"CREATE DATABASE IF NOT EXISTS nyc"
]
},
{
"cell_type": "markdown",
"id": "b84f88ef",
"metadata": {},
"source": [
"### Validate the location"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "14814dba",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'s3a://example/main/nyc.db'"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"spark.catalog.getDatabase(\"nyc\").locationUri"
]
},
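{
"cell_type": "markdown",
"id": "e5a72c40",
"metadata": {},
"source": [
"## Commit the changes\n",
"\n",
"As a possible next step, commit the dummy file and the new `nyc.db` directory to the `main` branch so they become part of the repository's history. A sketch assuming the legacy client's `commits.commit` call and `CommitCreation` model; the commit message is illustrative:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5a72c41",
"metadata": {},
"outputs": [],
"source": [
"from lakefs_client.models import CommitCreation\n",
"\n",
"lakefs.commits.commit(\n",
"    repository=repo.id,\n",
"    branch='main',\n",
"    commit_creation=CommitCreation(message='Create nyc database in Spark SQL warehouse'))"
]
}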
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}