Hello, I'm trying to export an Oracle partitioned table to data lake Parquet files.
I'm using this script:
# Convert it to a Spark SQL table and save it in Parquet format
df.write \
    .format("parquet") \
    .option("path", "/archive/" + schema_name + "/" + table_name + ".parquet") \
    .mode("append") \
    .saveAsTable(table_name)
This code reads all the data in the table, not a single partition.
spark = SparkSession.builder \
    .appName("Load " + schema_name + " " + table_name + " from Oracle into Parquet and creating Table") \
    .getOrCreate()
This one creates a table from Oracle.
How can I get only the Parquet files? :)
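For reference, writing only the Parquet files without registering a table in the metastore would look something like this (a sketch reusing the path variables from the snippet above):

# Write Parquet files to the archive path; no table is created in the metastore
df.write \
    .mode("append") \
    .parquet("/archive/" + schema_name + "/" + table_name + ".parquet")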
You said that when you select from the table you get data from the whole table, but you want a particular partition. Did you try referencing the partition directly with the syntax PARTITION(partition_name)?
How many partitions do you have? If there are not too many, you can also try creating a view for each partition and then selecting data from the view.
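For example, the Spark JDBC dbtable option accepts a subquery, so the PARTITION clause can go straight into it. A minimal sketch with hypothetical names (SCHEMA_NAME.SALES and the partition SALES_2021Q1), reusing the connection variables from the question:

# Hypothetical table and partition names, for illustration only
partition_query = "(select * from SCHEMA_NAME.SALES partition (SALES_2021Q1)) t"

df_part = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:oracle:thin:@" + db_host + ":" + db_port + "/" + db_service) \
    .option("dbtable", partition_query) \
    .option("user", db_user) \
    .option("password", db_pass) \
    .option("driver", "oracle.jdbc.OracleDriver") \
    .load()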
I created a table in Oracle named Checkes, added the partition names to it, and after that I can read the partition names from Spark:
query = '(select partition_name from Schema.checkes c) checkes'

df = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:oracle:thin:@" + db_host + ":" + db_port + "/" + db_service) \
    .option("dbtable", query) \
    .option("user", db_user) \
    .option("password", db_pass) \
    .option("driver", "oracle.jdbc.OracleDriver") \
    .option("encoding", "UTF-8") \
    .option("fetchSize", 10000) \
    .option("numPartitions", 40) \
    .load()

print("part count: " + str(df.count()))
if df.count() > 0:
    partition_name = df.select("partition_name").collect()[0]["partition_name"]
# query1 is not shown here; presumably it selects from the source table,
# restricted to the partition read into partition_name above
df1 = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:oracle:thin:@" + db_host + ":" + db_port + "/" + db_service) \
    .option("dbtable", query1) \
    .option("user", db_user) \
    .option("password", db_pass) \
    .option("driver", "oracle.jdbc.OracleDriver") \
    .option("encoding", "UTF-8") \
    .option("fetchSize", 10000) \
    .option("numPartitions", 40) \
    .load()
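The snippet above references query1 without defining it; presumably it selects from the source table restricted to the partition just read. A hypothetical construction based on partition_name (Schema.source_table is a placeholder name):

# Hypothetical: restrict the JDBC subquery to the partition discovered above
query1 = "(select * from Schema.source_table partition (" + partition_name + ")) p"

Once df1 is loaded this way, it can be written with df1.write.parquet(...) only, exactly as in the earlier sketch, without calling saveAsTable.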
Related
I am trying to insert 30 million records from Databricks into Azure SQL. The SPN token is valid for 65 minutes, so a single insert does not complete in time. I am inserting the data in batches of 2 million records and generating a new token for each batch, but I still get the same error after 4 batches (after inserting 6M records; it fails after about 1:30 hours).
Error : Token Expired
table_name = "TABLE NAME"

if count <= 2000000:
    access_token, connection_string = Service_Principal()
    df.write.format("jdbc") \
        .mode("append") \
        .option("url", jdbcUrl) \
        .option("dbtable", table_name) \
        .option("accessToken", access_token) \
        .option("encrypt", "true") \
        .option("hostNameInCertificate", "") \
        .option("driver", "") \
        .save()
else:
    chunk = 2000000
    id1 = 0
    id2 = chunk
    c = count
    while id1 < c:
        print("Insertion STARTED at : " + str(datetime.datetime.now()))
        stop_df = final_df.filter((final_df.id_tmp < id2) & (final_df.id_tmp >= id1))
        access_token, connection_string = Service_Principal()
        # the original wrote df here; presumably the filtered chunk stop_df is intended
        stop_df.write.format("jdbc") \
            .mode("append") \
            .option("url", jdbcUrl) \
            .option("dbtable", table_name) \
            .option("accessToken", access_token) \
            .option("encrypt", "true") \
            .option("hostNameInCertificate", "") \
            .option("driver", "") \
            .save()
        print("Insertion COMPLETED at : " + str(datetime.datetime.now()))
        id1 += chunk
        id2 += chunk
How can we close the JDBC connection after each batch, or refresh/discard the SPN token for each batch?
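For what it's worth, the Spark JDBC writer opens and closes its connections inside the executors, so there is no user-side connection handle to close. One way to make sure the token is fetched immediately before every batch write is to move the write into a small helper; this is only a restructuring of the loop above (write_chunk is a name introduced here, and Service_Principal is assumed to return a fresh token on every call):

def write_chunk(chunk_df, table_name, jdbc_url):
    # Fetch a fresh SPN access token right before writing this batch
    access_token, _ = Service_Principal()
    chunk_df.write.format("jdbc") \
        .mode("append") \
        .option("url", jdbc_url) \
        .option("dbtable", table_name) \
        .option("accessToken", access_token) \
        .option("encrypt", "true") \
        .option("hostNameInCertificate", "") \
        .option("driver", "") \
        .save()

chunk = 2000000
id1, id2 = 0, chunk
while id1 < count:
    batch_df = final_df.filter((final_df.id_tmp >= id1) & (final_df.id_tmp < id2))
    write_chunk(batch_df, table_name, jdbcUrl)
    id1 += chunk
    id2 += chunk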
I am working on extracting table count information from Azure SQL Server for over 350 tables. Since the system metadata tables are not refreshed regularly, I can't rely on them. I have written the code below to achieve this:
import pyodbc
from pyspark.sql.types import *

pyodbc.pooling = False

def get_table_count(query, server, username, password, database):
    conn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};SERVER=' + server + ';DATABASE=' + database + ';UID=' + username + ';PWD=' + password)
    cursor = conn.cursor()
    cursor.execute(query)
    row = cursor.fetchone()
    columns = StructType([
        StructField('tableCount', LongType(), True),
        StructField('tableName', StringType(), True),
        StructField('databaseName', StringType(), True)
    ])
    data = [(row[0], row[1], row[2])]
    df = spark.createDataFrame(data=data, schema=columns)
    cursor.close()
    del cursor
    conn.close()
    return df
import pyspark.sql.functions as F

dbList = [SQLServerDB1, SQLServerDB1]
SQLServerDB1_query = ""
SQLServerDB2_query = ""

for db in dbList:
    print("Currently loading for " + db + " database")
    serverName = db + "SQLServerName"
    serverUser = db + "SQLServerUser"
    serverPassword = db + "SQLServerPassword"
    serverDB = db + "SQLServerDB"
    tables = df.select('target_object').filter(F.col('source') == db).distinct().toPandas()['target_object']
    for tablename in list(tables):
        if tablename != list(tables)[-1]:
            vars()["%s_query" % db] = f" Select count_big(*) as tableCount, '{tablename}' as tableName, '{db}' as databaseName from " + f"{tablename} \n" + " union \n" + vars()["%s_query" % db]
        else:
            vars()["%s_query" % db] = vars()["%s_query" % db] + f" Select count_big(*) as tableCount, '{tablename}' as tableName, '{db}' as databaseName from " + f"{tablename}"
    vars()["%s_DF" % db] = get_table_count(vars()["%s_query" % db], eval(serverName), eval(serverUser), eval(serverPassword), eval(serverDB))
    # exec(f'{db}_DF = get_table_count( vars()["%s_query"%db] , eval(serverName), eval(serverUser), eval(serverPassword), eval(serverDB) )')
    # print(tablename + " Loaded")
Getting the below error:
('42000', "[42000] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Parse error at line: 3, column: 1: Incorrect syntax near 'union'. (103010) (SQLExecDirectW)")
I tried printing the SQL statements and they ran without any issue directly against the SQL Server DB.
Please suggest where I am writing the code incorrectly.
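As an aside, the per-database query can also be assembled in a single pass with str.join, which avoids the manual union bookkeeping in the inner loop. A sketch that keeps the same column aliases (not the fix that was eventually used):

# Build one SELECT per table, then join them; no leading or trailing UNION can appear
selects = [
    f"select count_big(*) as tableCount, '{tablename}' as tableName, '{db}' as databaseName from {tablename}"
    for tablename in list(tables)
]
union_query = "\nunion all\n".join(selects)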
I tried the code below and it works. Thanks, guys, for the suggestions!
def get_table_count(query, server, username, password, database):
    jdbc_url = f"jdbc:sqlserver://{server}:1433;databaseName={database}"
    df_read = spark.read \
        .format("jdbc") \
        .option("url", jdbc_url) \
        .option("query", query) \
        .option("user", username) \
        .option("password", password) \
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
        .load()
    df_read.write.mode('overwrite').parquet('/tmp/' + f"{database}" + '.parquet')
    df = spark.read.parquet('/tmp/' + f"{database}" + '.parquet')
    return df
import pyspark.sql.functions as F

dbList = [SQLServerDB1, SQLServerDB1]
SQLServerDB1_query = ""
SQLServerDB2_query = ""

for db in dbList:
    print("Currently loading for " + db + " database")
    serverName = db + "SQLServerName"
    serverUser = db + "SQLServerUser"
    serverPassword = db + "SQLServerPassword"
    serverDB = db + "SQLServerDB"
    tables = df.select('target_object').filter(F.col('source') == db).distinct().toPandas()['target_object']
    for tablename in list(tables):
        if tablename != list(tables)[-1]:
            vars()["%s_query" % db] = f" Select count_big(1) as tableCount, '{tablename}' as tableName, '{db}' as databaseName from " + f"{tablename} \n" + " union \n" + vars()["%s_query" % db]
        else:
            vars()["%s_query" % db] = vars()["%s_query" % db] + f" Select count_big(1) as tableCount, '{tablename}' as tableName, '{db}' as databaseName from " + f"{tablename}"
    print(vars()["%s_query" % db])
    vars()["%s_DF" % db] = get_table_count(vars()["%s_query" % db], eval(serverName), eval(serverUser), eval(serverPassword), eval(serverDB))
    vars()["%s_DF" % db].createOrReplaceTempView(f"{db}_tablesCount")
    print(f"{db}" + " Loaded")
I want to read data from a Postgres DB over a 1-hour time interval, and I want the process to run every hour. How can I do that? I have attached my code snippet. I am unable to use readStream with the JDBC source.
df = spark.read \
    .format("jdbc") \
    .option("url", URL) \
    .option("dbtable", "tagpool_with_tag_raw") \
    .option("user", "tsdbadmin") \
    .option("password", "cgqu5qss2zy3i1") \
    .option("driver", "org.postgresql.Driver") \
    .load()

# Getting the current date and time
dt = datetime.datetime.now(timezone.utc)
utc_time = dt.replace(tzinfo=timezone.utc)
utc_timestamp = utc_time.timestamp()
epoch = round(utc_timestamp / 60) * 60
# epoch = epoch + 3600
print("epoch ", epoch)

df.createOrReplaceTempView("tagpool_with_tag_raw")
x = spark.sql("""select * from tagpool_with_tag_raw""")
x.show()

query = spark.sql("select * from tagpool_with_tag_raw WHERE input_time = " + str(epoch))  # .format()
# query = spark.sql("select CAST(input_time AS bigint), CAST(orig_time AS bigint) from tagpool_with_tag_raw WHERE input_time = " + epoch)  # .format()
query.show()

# df.selectExpr("SELECT * FROM public.tagpool_raw WHERE input_time <= %s".format(epoch))
df.printSchema()

query.write \
    .format("jdbc") \
    .option("url", URL) \
    .option("dbtable", "tagpool_tag_raw") \
    .option("user", USER) \
    .option("password", PW) \
    .option("driver", DRIVER) \
    .save()
readStream is not available for JDBC, as JDBC is a batch source. You will have to keep a process just like the one you have and use a scheduler such as AutoSys, Oozie, or whatever your enterprise uses to run it every hour.
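To keep each scheduled run from pulling the whole table, the one-hour window can also be pushed down to Postgres through a subquery in dbtable. A sketch reusing URL, USER, PW and the table name from the question; treating input_time as an epoch in seconds is an assumption based on how epoch is computed above:

import datetime
from datetime import timezone

# Each run covers the previous full hour [window_start, window_end)
now = datetime.datetime.now(timezone.utc)
window_end = int(now.replace(minute=0, second=0, microsecond=0).timestamp())
window_start = window_end - 3600

hour_query = ("(select * from tagpool_with_tag_raw "
              "where input_time >= {0} and input_time < {1}) as t").format(window_start, window_end)

df_hour = spark.read \
    .format("jdbc") \
    .option("url", URL) \
    .option("dbtable", hour_query) \
    .option("user", USER) \
    .option("password", PW) \
    .option("driver", "org.postgresql.Driver") \
    .load()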
I am trying to update a record if it exists and insert it if no record is found, using the code below:
for index, row in df.iterrows():
    cols = "],[".join([str(i) for i in df.columns.tolist()])
    cols = "([" + cols + "])"
    ucols = "] = ?,[".join([str(i) for i in df.columns.tolist()])
    ucols = "[" + ucols + "] = ?"
    c.execute("SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;")
    c.execute("BEGIN TRANSACTION;")
    c.execute("UPDATE " + tblname + " SET " + ucols + " WHERE [TESTNUMBER]=" + str(row['TESTNUMBER']) + " AND [ROWNUM] =" + str(row['ROWNUM']) + ";", tuple(row))
    sqlr = "IF @@ROWCOUNT = 0 " \
           "BEGIN " \
           "INSERT INTO " + tblname + cols + " VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?); " \
           "END " \
           "COMMIT TRANSACTION;"
    c.execute(sqlr, tuple(row))
I am getting the below error message:
{ProgrammingError}('25000', u'[25000] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Transaction count after EXECUTE indicates a mismatching number of BEGIN and COMMIT statements. Previous count = 2, current count = 1. (266) (SQLExecDirectW)')
Not sure what I am doing wrong; I'd appreciate your help.
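One possible approach, sketched under the assumption that conn is the pyodbc connection behind cursor c: drop the explicit BEGIN/COMMIT and let pyodbc's own transaction handling (autocommit is off by default) bracket each round trip, keeping the UPDATE and the conditional INSERT in one batch so @@ROWCOUNT still refers to the UPDATE:

# Sketch: one parameterized batch per row; transaction control is left to pyodbc
sql = (
    "UPDATE " + tblname + " SET " + ucols +
    " WHERE [TESTNUMBER] = ? AND [ROWNUM] = ?; "
    "IF @@ROWCOUNT = 0 "
    "BEGIN "
    "INSERT INTO " + tblname + cols + " VALUES(" + ",".join(["?"] * len(df.columns)) + "); "
    "END"
)
params = tuple(row) + (row['TESTNUMBER'], row['ROWNUM']) + tuple(row)
c.execute(sql, params)
conn.commit()  # assumes conn is the connection object the cursor c came from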
I'm using Spotlight via PyObjC, which is working well except when I try to limit the time period using kMDItemContentCreationDate. I think the issue is with my time format. Any help would be greatly appreciated.
from Cocoa import *
import sys

emails = [sys.argv[1], ]

predicate = "(kMDItemContentType = 'com.apple.mail.emlx') && (" + \
    '||'.join(["((kMDItemAuthorEmailAddresses = '%s'))" % m for m in emails]) + \
    "&& (kMDItemContentCreationDate > '2011-03-23 00:00:00')" + \
    "&& (kMDItemContentCreationDate < '2012-03-24 00:00:00')" + \
    ")"

query = NSMetadataQuery.alloc().init()
query.setPredicate_(NSPredicate.predicateWithFormat_(predicate))
query.setSortDescriptors_(NSArray.arrayWithObject_(NSSortDescriptor.alloc().initWithKey_ascending_('kMDItemContentCreationDate', False)))
query.startQuery()
NSRunLoop.currentRunLoop().runUntilDate_(NSDate.dateWithTimeIntervalSinceNow_(5))
query.stopQuery()

results = query.results()[:5]
for item in results:
    print "subject: ", item.valueForAttribute_("kMDItemSubject")
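One possible cause is that NSPredicate treats '2011-03-23 00:00:00' as a plain string rather than a date. A sketch that builds the date bounds as NSDate objects and assembles the predicate from Cocoa's compound-predicate APIs (the dates are the ones from the question; whether NSMetadataQuery accepts these comparands for kMDItemContentCreationDate is the assumption being tested here):

import time
import datetime
from Cocoa import NSDate, NSPredicate, NSCompoundPredicate

# Convert the question's bounds to NSDate (local-time interpretation assumed)
start = NSDate.dateWithTimeIntervalSince1970_(time.mktime(datetime.datetime(2011, 3, 23).timetuple()))
end = NSDate.dateWithTimeIntervalSince1970_(time.mktime(datetime.datetime(2012, 3, 24).timetuple()))

type_part = NSPredicate.predicateWithFormat_("kMDItemContentType == 'com.apple.mail.emlx'")
email_part = NSCompoundPredicate.orPredicateWithSubpredicates_([
    NSPredicate.predicateWithFormat_argumentArray_("kMDItemAuthorEmailAddresses == %@", [m])
    for m in emails
])
date_part = NSPredicate.predicateWithFormat_argumentArray_(
    "kMDItemContentCreationDate > %@ AND kMDItemContentCreationDate < %@", [start, end]
)

query.setPredicate_(NSCompoundPredicate.andPredicateWithSubpredicates_([type_part, email_part, date_part]))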