AWS Glue Docker ignoring customJdbcDriverClassName and customJdbcDriverS3Path - python

Attempting to run a very trivial Glue script locally via Docker, I can't seem to connect to a MySQL database that is also running in Docker.
My Docker setup is:
version: '3.7'
services:
  glue:
    container_name: "dev-glue"
    image: amazon/aws-glue-libs:glue_libs_3.0.0_image_01-arm64
    ports:
      - "4040:4040"
      - "18080:18080"
    volumes:
      - ~/.aws:/home/glue_user/.aws
      - /workspace:/home/glue_user/workspace/
    environment:
      - "AWS_PROFILE=$AWS_PROFILE"
      - "AWS_REGION=us-west-2"
      - "AWS_DEFAULT_REGION=us-west-2"
      - "DISABLE_SSL=true"
    stdin_open: true

  mysql:
    image: mysql:8.0
    container_name: 'dev-mysql'
    command: --default-authentication-plugin=mysql_native_password
    restart: always
    environment:
      MYSQL_ROOT_PASSWORD: password
    volumes:
      - mysql-db:/var/lib/mysql
    ports:
      - '3306:3306'

volumes:
  mysql-db:
According to the documentation found here, the following should work without a problem:
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame

glueContext = GlueContext(SparkContext.getOrCreate())

def df_mysql(glue_context: GlueContext, schema: str, table: str):
    connection = {
        "url": f"jdbc:mysql://dev-mysql/{schema}",
        "dbtable": table,
        "user": "root",
        "password": "password",
        "customJdbcDriverS3Path": "s3://my-bucket/mysql-connector-java-8.0.17.jar",
        "customJdbcDriverClassName": "com.mysql.cj.jdbc.Driver"
    }

    data_frame: DynamicFrame = glue_context.create_dynamic_frame.from_options(connection_type="mysql", connection_options=connection)
    data_frame.printSchema()

df_mysql(glueContext, "my_schema", "my_table")
However, this fails with:
Traceback (most recent call last):
File "/home/glue_user/workspace/local/local_mysql.py", line 25, in <module>
df_mysql(glueContext, "my_schema", "my_table")
File "/home/glue_user/workspace/local/local_mysql.py", line 19, in df_mysql
data_frame: DynamicFrame = glue_context.create_dynamic_frame.from_options(connection_type="mysql", connection_options=connection)
File "/home/glue_user/aws-glue-libs/PyGlue.zip/awsglue/dynamicframe.py", line 608, in from_options
File "/home/glue_user/aws-glue-libs/PyGlue.zip/awsglue/context.py", line 228, in create_dynamic_frame_from_options
File "/home/glue_user/aws-glue-libs/PyGlue.zip/awsglue/data_source.py", line 36, in getFrame
File "/home/glue_user/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1305, in __call__
File "/home/glue_user/spark/python/pyspark/sql/utils.py", line 111, in deco
return f(*a, **kw)
File "/home/glue_user/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o45.getDynamicFrame.
: java.lang.ClassNotFoundException: com.mysql.cj.jdbc.Driver
at java.net.URLClassLoader.findClass(URLClassLoader.java:387)
at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
at org.apache.spark.sql.execution.datasources.jdbc.DriverRegistry$.register(DriverRegistry.scala:46)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.$anonfun$driverClass$1(JDBCOptions.scala:102)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.$anonfun$driverClass$1$adapted(JDBCOptions.scala:102)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:102)
at org.apache.spark.sql.jdbc.glue.GlueJDBCOptions.<init>(GlueJDBCOptions.scala:14)
at org.apache.spark.sql.jdbc.glue.GlueJDBCOptions.<init>(GlueJDBCOptions.scala:17)
at org.apache.spark.sql.jdbc.glue.GlueJDBCSource$.createRelation(GlueJDBCSource.scala:29)
at com.amazonaws.services.glue.util.JDBCWrapper.tableDF(JDBCUtils.scala:878)
at com.amazonaws.services.glue.util.NoCondition$.tableDF(JDBCUtils.scala:86)
at com.amazonaws.services.glue.util.NoJDBCPartitioner$.tableDF(JDBCUtils.scala:172)
at com.amazonaws.services.glue.JDBCDataSource.getDynamicFrame(DataSource.scala:967)
at com.amazonaws.services.glue.DataSource.getDynamicFrame(DataSource.scala:99)
at com.amazonaws.services.glue.DataSource.getDynamicFrame$(DataSource.scala:99)
at com.amazonaws.services.glue.SparkSQLDataSource.getDynamicFrame(DataSource.scala:714)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:750)
What's interesting here is that it's failing due to java.lang.ClassNotFoundException: com.mysql.cj.jdbc.Driver, but if I change the config to use something else, it fails with the same error message.
Is there some other environment variable or configuration I'm supposed to set in order for this to work?
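For reference, one possible workaround sketch (my own, not something the Glue documentation prescribes for this case; the local jar path is hypothetical) is to put the MySQL Connector/J jar on the local Spark classpath yourself and go through Spark's plain JDBC reader, bypassing the custom-driver options entirely:

# Workaround sketch: register a locally available Connector/J jar via spark.jars
# and read through Spark's built-in JDBC source instead of the Glue custom-driver
# options. The jar path below is hypothetical; mount or download it into the container.
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from awsglue.context import GlueContext

conf = SparkConf()
conf.set("spark.jars", "/home/glue_user/workspace/jars/mysql-connector-java-8.0.17.jar")

glue_context = GlueContext(SparkContext.getOrCreate(conf=conf))
spark = glue_context.spark_session

df = (spark.read.format("jdbc")
      .option("url", "jdbc:mysql://dev-mysql/my_schema")
      .option("dbtable", "my_table")
      .option("user", "root")
      .option("password", "password")
      .option("driver", "com.mysql.cj.jdbc.Driver")
      .load())
df.printSchema()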

Related

Error while Inserting spark dataframe into SQL Server

I am inserting a Spark dataframe into a table in a SQL Server database using PySpark. I am using PyCharm as my IDE.
Here is my code snippet:
server_name = "SERVER"
database_name = "DB"
url = "jdbc:sqlserver://{"+server_name+"} "+ ";" + "databaseName=" + database_name + ";"
table_name = "Table_Temp"
username = "USER"
password = "PASS"

df_target.write \
    .format("com.microsoft.sqlserver.jdbc.spark") \
    .mode("overwrite") \
    .option("url", url) \
    .option("dbtable", table_name) \
    .option("user", username) \
    .option("password", password) \
    .save()
While executing the above snippet, I got the following error:
Traceback (most recent call last):
File "E:\python\SCD2.py", line 129, in <module>
df_target.write \
File "E:\spark-3.3.1-bin-hadoop3\python\lib\pyspark.zip\pyspark\sql\readwriter.py", line 966, in save
File "E:\spark-3.3.1-bin-hadoop3\python\lib\py4j-0.10.9.5-src.zip\py4j\java_gateway.py", line 1321, in __call__
File "E:\spark-3.3.1-bin-hadoop3\python\lib\pyspark.zip\pyspark\sql\utils.py", line 190, in deco
File "E:\spark-3.3.1-bin-hadoop3\python\lib\py4j-0.10.9.5-src.zip\py4j\protocol.py", line 326, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o198.save.
: java.lang.ClassNotFoundException:
Failed to find data source: com.microsoft.sqlserver.jdbc.spark. Please find packages at
https://spark.apache.org/third-party-projects.html
at org.apache.spark.sql.errors.QueryExecutionErrors$.failedToFindDataSourceError(QueryExecutionErrors.scala:587)
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:675)
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:725)
at org.apache.spark.sql.DataFrameWriter.lookupV2Provider(DataFrameWriter.scala:864)
at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:256)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:247)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: com.microsoft.sqlserver.jdbc.spark.DefaultSource
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:661)
at scala.util.Try$.apply(Try.scala:213)
at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:661)
at scala.util.Failure.orElse(Try.scala:224)
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:661)
I have installed Spark.
I really don't know where I am going wrong. Please help me.
Current support from Microsoft restricts write/overwrite/append operations on SQL Server 2008+ to Apache Spark 2.4.x, 3.0.x, and 3.1.x, while you're using Spark 3.3.1. Hence the error message!
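If changing the Spark version isn't an option, a minimal fallback sketch (my own suggestion, assuming the plain mssql-jdbc driver jar is on the classpath, e.g. via --packages) is to use Spark's built-in jdbc format instead of the com.microsoft.sqlserver.jdbc.spark connector:

# Fallback sketch: Spark's built-in JDBC writer instead of the Microsoft Spark connector.
# Assumes the mssql-jdbc driver is available, e.g.
#   spark-submit --packages com.microsoft.sqlserver:mssql-jdbc:12.4.2.jre8 ...
url = "jdbc:sqlserver://SERVER;databaseName=DB"   # hypothetical server/database names

df_target.write \
    .format("jdbc") \
    .mode("overwrite") \
    .option("url", url) \
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
    .option("dbtable", "Table_Temp") \
    .option("user", "USER") \
    .option("password", "PASS") \
    .save()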

AWS Glue Error: py4j.protocol.Py4JJavaError: An error occurred while calling o89.getCatalogSource. : java.util.NoSuchElementException: None.get

I am running the following Glue script to copy certain columns from a Postgres table to an S3 bucket:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue import DynamicFrame

def sparkSqlQuery(glueContext, query, mapping, transformation_ctx) -> DynamicFrame:
    for alias, frame in mapping.items():
        frame.toDF().createOrReplaceTempView(alias)
    result = spark.sql(query)
    return DynamicFrame.fromDF(result, glueContext, transformation_ctx)

args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)

PostgreSQL_node1668126214499 = glueContext.create_dynamic_frame.from_catalog(
    database="glue-catalog-ra-db",
    table_name="rentalassistance_public_addresses",
    transformation_ctx="PostgreSQL_node1668126214499",
)

SqlQuery0 = """
select id, zip, city from myDataSource
"""

SQLQuery_node1668127298659 = sparkSqlQuery(
    glueContext,
    query=SqlQuery0,
    mapping={"p1": PostgreSQL_node1668126214499},
    transformation_ctx="SQLQuery_node1668127298659",
)

S3bucket_node3 = glueContext.write_dynamic_frame.from_options(
    frame=SQLQuery_node1668127298659,
    connection_type="s3",
    format="json",
    connection_options={
        "path": "s3://refined-data.edquity-dev.co/demo_data/",
        "partitionKeys": [],
    },
    transformation_ctx="S3bucket_node3",
)

job.commit()
Unfortunately, I am seeing the following error in my error logs when I try to run this job:
2022-11-11 00:43:45,504 ERROR [main] glue.ProcessLauncher (Logging.scala:logError(73)): Error from Python:Traceback (most recent call last):
File "/tmp/export-public-addresses", line 28, in <module>
transformation_ctx="PostgreSQL_node1668126214499",
File "/opt/amazon/lib/python3.6/site-packages/awsglue/dynamicframe.py", line 787, in from_catalog
return self._glue_context.create_dynamic_frame_from_catalog(db, table_name, redshift_tmp_dir, transformation_ctx, push_down_predicate, additional_options, catalog_id, **kwargs)
File "/opt/amazon/lib/python3.6/site-packages/awsglue/context.py", line 186, in create_dynamic_frame_from_catalog
makeOptions(self._sc, additional_options), catalog_id),
File "/opt/amazon/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1305, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 111, in deco
return f(*a, **kw)
File "/opt/amazon/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py", line 328, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o89.getCatalogSource.
: java.util.NoSuchElementException: None.get
at scala.None$.get(Option.scala:349)
at scala.None$.get(Option.scala:347)
at com.amazonaws.services.glue.util.DataCatalogWrapper.$anonfun$getJDBCConf$1(DataCatalogWrapper.scala:218)
at scala.util.Try$.apply(Try.scala:209)
at com.amazonaws.services.glue.util.DataCatalogWrapper.getJDBCConf(DataCatalogWrapper.scala:209)
at com.amazonaws.services.glue.GlueContext.getGlueNativeJDBCSource(GlueContext.scala:487)
at com.amazonaws.services.glue.GlueContext.getCatalogSource(GlueContext.scala:320)
at com.amazonaws.services.glue.GlueContext.getCatalogSource(GlueContext.scala:185)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:750)
as well as this in another error log:
{
"Event": "GlueETLJobExceptionEvent",
"Timestamp": 1668127425524,
"Failure Reason": "Traceback (most recent call last):\n File \"/tmp/export-public-addresses\", line 28, in <module>\n transformation_ctx=\"PostgreSQL_node1668126214499\",\n File \"/opt/amazon/lib/python3.6/site-packages/awsglue/dynamicframe.py\", line 787, in from_catalog\n return self._glue_context.create_dynamic_frame_from_catalog(db, table_name, redshift_tmp_dir, transformation_ctx, push_down_predicate, additional_options, catalog_id, **kwargs)\n File \"/opt/amazon/lib/python3.6/site-packages/awsglue/context.py\", line 186, in create_dynamic_frame_from_catalog\n makeOptions(self._sc, additional_options), catalog_id),\n File \"/opt/amazon/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py\", line 1305, in __call__\n answer, self.gateway_client, self.target_id, self.name)\n File \"/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/utils.py\", line 111, in deco\n return f(*a, **kw)\n File \"/opt/amazon/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py\", line 328, in get_return_value\n format(target_id, \".\", name), value)\npy4j.protocol.Py4JJavaError: An error occurred while calling o89.getCatalogSource.\n: java.util.NoSuchElementException: None.get\n\tat scala.None$.get(Option.scala:349)\n\tat scala.None$.get(Option.scala:347)\n\tat com.amazonaws.services.glue.util.DataCatalogWrapper.$anonfun$getJDBCConf$1(DataCatalogWrapper.scala:218)\n\tat scala.util.Try$.apply(Try.scala:209)\n\tat com.amazonaws.services.glue.util.DataCatalogWrapper.getJDBCConf(DataCatalogWrapper.scala:209)\n\tat com.amazonaws.services.glue.GlueContext.getGlueNativeJDBCSource(GlueContext.scala:487)\n\tat com.amazonaws.services.glue.GlueContext.getCatalogSource(GlueContext.scala:320)\n\tat com.amazonaws.services.glue.GlueContext.getCatalogSource(GlueContext.scala:185)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat java.lang.Thread.run(Thread.java:750)\n",
"Stack Trace": [
{
"Declaring Class": "get_return_value",
"Method Name": "format(target_id, \".\", name), value)",
"File Name": "/opt/amazon/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py",
"Line Number": 328
},
{
"Declaring Class": "deco",
"Method Name": "return f(*a, **kw)",
"File Name": "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/utils.py",
"Line Number": 111
},
...
I was wondering if anyone has successfully solved this issue in the past?
Thus far I've tried removing the column selector to simplify the overall process to two steps, but that did not make a difference.
Additionally, I've tried increasing the memory/node allocation, but neither of these made a difference either.
EDIT: I just tried viewing the data through the "data preview" option in Glue Studio's visual editor and I see the same error. It seems it's clearly unable to locate the data, even though the crawler appears to be working correctly, as it pulls in the correct tables.
If I'm understanding this right, it looks like an issue with locating the catalog database and table. But I used the visual tool to construct this, so I'm not sure why it wouldn't be able to find those crawled tables.
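Since the failure happens inside getJDBCConf, one diagnostic sketch (my own, using standard boto3 Glue calls; the database and table names come from the script above, and the "connectionName" parameter key is an assumption about what the crawler records) is to check whether the crawled catalog table actually has a Glue connection with JDBC properties attached:

# Diagnostic sketch: inspect what the crawler recorded for the table and whether
# a Glue connection with JDBC properties is attached to it.
import boto3

glue = boto3.client("glue")

table = glue.get_table(DatabaseName="glue-catalog-ra-db",
                       Name="rentalassistance_public_addresses")
params = table["Table"].get("Parameters", {})
print("Table parameters:", params)

# "connectionName" is the key I'd expect a JDBC crawler to populate (assumption).
conn_name = params.get("connectionName")
if conn_name:
    conn = glue.get_connection(Name=conn_name)
    print(conn["Connection"]["ConnectionProperties"])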

Spark JDBC error connecting to PostgreSQL

To begin, I have seen a few posts on this, but did not have much luck with any of the fixes.
I currently have the following code:
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

if __name__ == '__main__':
    scSpark = SparkSession.builder.appName("postgres") \
        .config("spark.driver.extraClassPath", "C:/Spark/spark-3.0.1-bin-hadoop2.7/jars/postgresql-42.2.18.jar") \
        .getOrCreate()

    data_file = './data.csv'
    sdfData = scSpark.read.csv(data_file, header=True, sep=',').cache()
    sdfData.registerTempTable('sales')

    scSpark = SparkSession.builder.appName("postgres") \
        .config("spark.driver.extraClassPath", "C:/Spark/spark-3.0.1-bin-hadoop2.7/jars/postgresql-42.2.18.jar") \
        .getOrCreate()

    output = scSpark.sql('SELECT * from sales')
    output.write.format('jdbc').options(url='jdbc:postgresql://localhost:5432/spark',driver='com.mysql.cj.jdbc.Driver',dbtable='city_info',user='postgres',password='password').mode('append').save()
When running this code, I get the following error:
Traceback (most recent call last):
File "main.py", line 20, in <module>
output.write.format('jdbc').options(url='jdbc:postgresql://localhost:5432/spark',driver='com.mysql.cj.jdbc.Driver',dbtable='city_info',user='postgres',password='password').mode('append').save()
File "C:\Users\jackt\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pyspark\sql\readwriter.py", line 825, in save
self._jwrite.save()
File "C:\Users\jackt\AppData\Local\Programs\Python\Python38-32\lib\site-packages\py4j\java_gateway.py", line 1304, in __call__
return_value = get_return_value(
File "C:\Users\jackt\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pyspark\sql\utils.py", line 128, in deco
return f(*a, **kw)
File "C:\Users\jackt\AppData\Local\Programs\Python\Python38-32\lib\site-packages\py4j\protocol.py", line 326, in get_return_value
raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o49.save.
: java.lang.ClassNotFoundException: com.mysql.cj.jdbc.Driver
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
at org.apache.spark.sql.execution.datasources.jdbc.DriverRegistry$.register(DriverRegistry.scala:45)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.$anonfun$driverClass$1(JDBCOptions.scala:99)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.$anonfun$driverClass$1$adapted(JDBCOptions.scala:99)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions.<init>(JDBCOptions.scala:99)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcOptionsInWrite.<init>(JDBCOptions.scala:194)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcOptionsInWrite.<init>(JDBCOptions.scala:198)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:45)
at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:46)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:90)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:122)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:121)
at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:963)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:963)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:415)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:399)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
I know that the error is java.lang.ClassNotFoundException: com.mysql.cj.jdbc.Driver. However, I manually add it to the classpath when I begin the session, and I ensured that the file had full permissions given to the Users group.
Change the driver parameter when saving the DataFrame into the database. You need to use org.postgresql.Driver for PostgreSQL:
output.write.format('jdbc').options(
    url='jdbc:postgresql://localhost:5432/spark',
    driver='org.postgresql.Driver',
    dbtable='city_info',
    user='postgres',
    password='password'
).mode('append').save()
Wrong JDBC driver
com.mysql.cj.jdbc.Driver is for MySQL, not for Postgres. These are two different, competing database server products.
For accessing Postgres, you have a choice of JDBC driver vendors:
PostgreSQL JDBC Driver from postgresql.org
PGJDBC-NG from impossibl, for JDBC 4.2 and later.
Commercial vendors, such as OpenLink Software
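As a short usage sketch of the first option (the postgresql.org driver, with illustrative connection values matching the snippet in the question), the same driver class works for reads as well as writes:

# Read sketch using the PostgreSQL JDBC driver from postgresql.org.
# Connection values are illustrative; scSpark is the session built above.
df = scSpark.read.format('jdbc').options(
    url='jdbc:postgresql://localhost:5432/spark',
    driver='org.postgresql.Driver',
    dbtable='city_info',
    user='postgres',
    password='password'
).load()
df.show()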

Docker-compose with Airflow - MS SQL Server (connection failed)

I'm not able to connect to SQL Server inside Airflow using docker-compose. I want to take data from SQL Server directly to Cloud Storage, and then the data will be sent to BigQuery.
How can I solve this?
import json
from datetime import timedelta, datetime

from airflow import DAG
from airflow.models import Variable
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from airflow.contrib.operators.bigquery_check_operator import BigQueryCheckOperator
from airflow.contrib.operators.file_to_gcs import FileToGoogleCloudStorageOperator
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
from airflow.contrib.operators.mysql_to_gcs import MySqlToGoogleCloudStorageOperator

default_args = {
    'owner': 'Test Data',
    'depends_on_past': True,
    'start_date': datetime(2019, 5, 29),
    'end_date': datetime(2019, 5, 30),
    'email': ['email#clientx.com.br'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# Set Schedule: Run pipeline once a day.
# Use cron to define exact time. Eg. 8:15am would be "15 08 * * *"
schedule_interval = "* * * * *"

# Define DAG: Set ID and assign default args and schedule interval
dag = DAG(
    'bigquery_github_trends',
    default_args=default_args,
    schedule_interval=schedule_interval
)

extract = MySqlToGoogleCloudStorageOperator(
    task_id='chama_extract',
    mysql_conn_id='mysql_hml',
    google_cloud_storage_conn_id='my_gcp_conn',
    sql="""SELECT * FROM test""",
    bucket='my_bucket',
    filename='test/test{}.json',
    schema_filename='schemas/test.json',
    dag=dag)

load = GoogleCloudStorageToBigQueryOperator(
    task_id='chama_load',
    bigquery_conn_id='my_gcp_conn',
    google_cloud_storage_conn_id='my_gcp_conn',
    bucket='my_bucket',
    destination_project_dataset_table="tst.teste123",
    source_objects=['test/test0.json'],
    schema_object='schemas/test.json',
    source_format='NEWLINE_DELIMITED_JSON',
    create_disposition='CREATE_IF_NEEDED',
    write_disposition='WRITE_TRUNCATE',
    dag=dag)

# Setting up Dependencies
load.set_upstream(extract)
Docker-compose.yml
version: '3'
services:
  postgres:
    image: postgres:9.6
    environment:
      - POSTGRES_USER=airflow
      - POSTGRES_PASSWORD=airflow
      - POSTGRES_DB=airflow
    ports:
      - "5432:5432"

  webserver:
    image: puckel/docker-airflow:1.10.1
    build:
      context: https://github.com/puckel/docker-airflow.git#1.10.1
      dockerfile: Dockerfile
      args:
        AIRFLOW_DEPS: gcp_api,s3
    restart: always
    depends_on:
      - postgres
    environment:
      - LOAD_EX=n
      - EXECUTOR=Local
      - FERNET_KEY=jsDPRErfv8Z_eVTnGfF8ywd19j4pyqE3NpdUBA_oRTo=
    volumes:
      - ./examples/intro-example/dags:/usr/local/airflow/dags
      # Uncomment to include custom plugins
      # - ./plugins:/usr/local/airflow/plugins
    ports:
      - "8080:8080"
    command: webserver
    healthcheck:
      test: ["CMD-SHELL", "[ -f /usr/local/airflow/airflow-webserver.pid ]"]
      interval: 30s
      timeout: 30s
      retries: 3
docker-compose-gcloud.yml
version: '3'
services:
  postgres:
    image: postgres:9.6
    environment:
      - POSTGRES_USER=airflow
      - POSTGRES_PASSWORD=airflow
      - POSTGRES_DB=airflow
    ports:
      - "5432:5432"

  webserver:
    image: puckel/docker-airflow:1.10.1
    build:
      context: https://github.com/puckel/docker-airflow.git#1.10.1
      dockerfile: Dockerfile
      args:
        AIRFLOW_DEPS: gcp_api,s3
    restart: always
    depends_on:
      - postgres
    environment:
      - LOAD_EX=n
      - EXECUTOR=Local
      - FERNET_KEY=jsDPRErfv8Z_eVTnGfF8ywd19j4pyqE3NpdUBA_oRTo=
    volumes:
      - ./examples/gcloud-example/dags:/usr/local/airflow/dags
      # Uncomment to include custom plugins
      # - ./plugins:/usr/local/airflow/plugins
    ports:
      - "8080:8080"
    command: webserver
    healthcheck:
      test: ["CMD-SHELL", "[ -f /usr/local/airflow/airflow-webserver.pid ]"]
      interval: 30s
      timeout: 30s
      retries: 3
And I run the following Docker command:
docker-compose -f docker-compose-gcloud.yml up --abort-on-container-exit
Error message in Airflow:
[2019-05-29 07:00:37,938] {{logging_mixin.py:95}} INFO - [2019-05-29 07:00:37,937] {{base_hook.py:83}} INFO - Using connection to: 10.0.0.1
[2019-05-29 07:00:58,974] {{models.py:1760}} ERROR - (2003, 'Can\'t connect to MySQL server on 10.0.0.1 (111 "Connection refused")')
Traceback (most recent call last):
File "/usr/local/lib/python3.6/site-packages/airflow/models.py", line 1659, in _run_raw_task
result = task_copy.execute(context=context)
File "/usr/local/lib/python3.6/site-packages/airflow/contrib/operators/mysql_to_gcs.py", line 105, in execute
cursor = self._query_mysql()
File "/usr/local/lib/python3.6/site-packages/airflow/contrib/operators/mysql_to_gcs.py", line 127, in _query_mysql
conn = mysql.get_conn()
File "/usr/local/lib/python3.6/site-packages/airflow/hooks/mysql_hook.py", line 103, in get_conn
conn = MySQLdb.connect(**conn_config)
File "/usr/local/lib/python3.6/site-packages/MySQLdb/init.py", line 84, in Connect
return Connection(*args, **kwargs)
File "/usr/local/lib/python3.6/site-packages/MySQLdb/connections.py", line 164, in init
super(Connection, self).init(*args, **kwargs2)
MySQLdb._exceptions.OperationalError: (2003, 'Can\'t connect to MySQL server on 10.0.0.1 (111 "Connection refused")')
[2019-05-29 07:00:58,988] {{models.py:1789}} INFO - All retries failed; marking task as FAILED
[2019-05-29 07:00:58,992] {{logging_mixin.py:95}} INFO - [2019-05-29 07:00:58,991] {{configuration.py:255}} WARNING - section/key [smtp/smtp_user] not found in config
[2019-05-29 07:00:58,998] {{models.py:1796}} ERROR - [Errno 99] Cannot assign requested address
Traceback (most recent call last):
File "/usr/local/lib/python3.6/site-packages/airflow/models.py", line 1659, in _run_raw_task
result = task_copy.execute(context=context)
File "/usr/local/lib/python3.6/site-packages/airflow/contrib/operators/mysql_to_gcs.py", line 105, in execute
cursor = self._query_mysql()
File "/usr/local/lib/python3.6/site-packages/airflow/contrib/operators/mysql_to_gcs.py", line 127, in _query_mysql
conn = mysql.get_conn()
File "/usr/local/lib/python3.6/site-packages/airflow/hooks/mysql_hook.py", line 103, in get_conn
conn = MySQLdb.connect(**conn_config)
File "/usr/local/lib/python3.6/site-packages/MySQLdb/init.py", line 84, in Connect
return Connection(*args, **kwargs)
File "/usr/local/lib/python3.6/site-packages/MySQLdb/connections.py", line 164, in init
super(Connection, self).init(*args, **kwargs2)
MySQLdb._exceptions.OperationalError: (2003, 'Can\'t connect to MySQL server on 10.0.0.1 (111 "Connection refused")')
From the error, the key part to me seems to be the "get_conn" piece. This indicates that when Airflow tries to establish the connection to the database, it fails. This means either your connection is not specified (it looks like it might be) or that some part of it is incorrect.
You should check that the password, server address, and port are all correct. These should be in either your airflow.cfg, as environment variables, or set in the webserver (Admin panel).
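As a quick connectivity check sketch (my own, run inside the webserver container, e.g. from a Python shell; the conn_id matches the DAG above), you can exercise the same hook path the operator uses. Connections can also be supplied as AIRFLOW_CONN_<CONN_ID> URI environment variables in the compose file.

# Connectivity check sketch for the 'mysql_hml' connection used by the DAG above.
from airflow.hooks.mysql_hook import MySqlHook

hook = MySqlHook(mysql_conn_id='mysql_hml')
print(hook.get_records('SELECT 1'))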

Error While writing pyspark dataframe to MySQL database

I am getting the error
"Caused by: java.lang.NoSuchMethodException: org.apache.spark.sql.execution.datasources.jdbc.DriverWrapper.<init>()" while writing a PySpark dataframe to a MySQL database.
spark-submit command:
spark-submit --deploy-mode client --master yarn --conf spark.pyspark.python=/usr/bin/python3 --packages mysql:mysql-connector-java:8.0.12 s3://aramark-files/test_pyspark.py
And I am writing using:
df.write.jdbc(url="jdbc:mysql://dbhost/dbname", table="tablename", mode="append", properties={"user":"dbuser", "password": "s3cret"})
Below is the error I am getting after executing the above spark-submit command:
Traceback (most recent call last):
File "/mnt/tmp/spark-8bb457ce-fc88-4384-af58-9e52e2d6e21a/test_pyspark.py", line 51, in <module>
df.write.jdbc(jdbcUrl, where, mode='append', properties=dbProperties)
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 942, in jdbc
File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o79.jdbc.
: java.lang.InstantiationException: org.apache.spark.sql.execution.datasources.jdbc.DriverWrapper
at java.lang.Class.newInstance(Class.java:427)
at org.apache.spark.sql.execution.datasources.jdbc.DriverRegistry$.register(DriverRegistry.scala:53)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$1.apply(JdbcUtils.scala:55)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$1.apply(JdbcUtils.scala:54)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:63)
at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:654)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:654)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:654)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:273)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:267)
at org.apache.spark.sql.DataFrameWriter.jdbc(DataFrameWriter.scala:499)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.NoSuchMethodException: org.apache.spark.sql.execution.datasources.jdbc.DriverWrapper.<init>()
at java.lang.Class.getConstructor0(Class.java:3082)
at java.lang.Class.newInstance(Class.java:412)
... 34 more
I ran across the same problem in the Scala API. I'm reading from and writing to an Oracle 12c database, and both the DataFrameReader and the DataFrameWriter require the "driver" property to be set, in my case to "oracle.jdbc.OracleDriver", or else the former blows up with "No suitable driver" and the latter blows up with NoSuchMethodException.
I would therefore suggest you try
df.write.jdbc(url="jdbc:mysql://dbhost/dbname", table="tablename", mode="append", properties={"user":"dbuser", "password": "s3cret", "driver": "com.mysql.cj.jdbc.Driver" })
Where I've substituted the MySQL driver class name from the docs.
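For completeness, the read side would look similar under the same assumption (illustrative connection values, mirroring the question's URL and credentials):

# Read sketch with the driver property set explicitly, mirroring the write call above.
df = spark.read.jdbc(
    url="jdbc:mysql://dbhost/dbname",
    table="tablename",
    properties={"user": "dbuser", "password": "s3cret", "driver": "com.mysql.cj.jdbc.Driver"}
)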
