Oracle partitioned table to Data Lake using Python

Hello, I'm trying to export an Oracle partitioned table to a Data Lake Parquet file
using this script:
# Convert it to a Spark SQL table and save it in Parquet format
df.write \
    .format("parquet") \
    .option("path", "/archive/" + schema_name + "/" + table_name + ".parquet") \
    .mode("append") \
    .saveAsTable(table_name)
This code gets all the data of the table, not a single partition.
spark = SparkSession.builder \
    .appName("Load " + schema_name + " " + table_name + " from Oracle into Parquet and creating Table") \
    .getOrCreate()
This one creates the table from Oracle.
How can I get only a single partition into Parquet? :)

You said that when you select from the table you get the whole table's data, but you want the data from a particular partition. Did you try specifying the partition with the syntax PARTITION(partition_name)?
How many partitions do you have? If there are not too many, you can also try creating a view for each partition and then selecting the data from the view.
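For example, here is a minimal sketch of the first suggestion with Spark's JDBC reader: the PARTITION clause goes inside the subquery passed to the dbtable option. SCHEMA.TABLE and P1 are placeholders for your own schema, table, and partition names; the connection variables are the ones from your script.
# Hypothetical example: read only one Oracle partition through a JDBC subquery.
partition_query = "(select * from SCHEMA.TABLE partition (P1)) t"
df_part = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:oracle:thin:@" + db_host + ":" + db_port + "/" + db_service) \
    .option("dbtable", partition_query) \
    .option("user", db_user) \
    .option("password", db_pass) \
    .option("driver", "oracle.jdbc.OracleDriver") \
    .load()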

I created a table in Oracle named checkes and added the partition names to it;
after that I can read the partition names from Spark.
query = '(select partition_name from Schema.checkes c) checkes'
df = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:oracle:thin:@" + db_host + ":" + db_port + "/" + db_service) \
    .option("dbtable", query) \
    .option("user", db_user) \
    .option("password", db_pass) \
    .option("driver", "oracle.jdbc.OracleDriver") \
    .option("encoding", "UTF-8") \
    .option("fetchSize", 10000) \
    .option("numPartitions", 40) \
    .load()
print("part count: " + str(df.count()))
if df.count() > 0:partition_name = df.select("partition_name").collect()[0]["partition_name"]
# query1 (not shown above) is meant to select from the source table's partition
df1 = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:oracle:thin:@" + db_host + ":" + db_port + "/" + db_service) \
    .option("dbtable", query1) \
    .option("user", db_user) \
    .option("password", db_pass) \
    .option("driver", "oracle.jdbc.OracleDriver") \
    .option("encoding", "UTF-8") \
    .option("fetchSize", 10000) \
    .option("numPartitions", 40) \
    .load()
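In case it helps, a hedged sketch of what query1 and the final write could look like, assuming query1 wraps the source table with the PARTITION clause built from the partition_name read above (the Schema prefix and the path layout are placeholders):
# Hypothetical: build the partition-scoped subquery from the partition name.
query1 = "(select * from Schema." + table_name + " partition (" + partition_name + ")) t"
# ... read df1 with the JDBC options above, then write only that partition:
df1.write.mode("append").parquet("/archive/" + schema_name + "/" + table_name + "/" + partition_name + ".parquet")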

Related

Service Principal Token Expired

I am trying to insert 30 million records from Databricks into Azure SQL. The SPN token is only valid for 65 minutes, so a single insert does not complete in time. I am inserting the data in batches of 2M records and generating a new token for each batch, but I still get the same error after 4 batches (after inserting 6M records, it fails at around the 1:30 hr mark).
Error: Token Expired
table_name = "TABLE NAME"
if count <= 2000000:
access_token,connection_string = Service_Principal()
df.write.format("jdbc") \
.mode("append") \
.option("url", jdbcUrl) \
.option("dbtable", table_name) \
.option("accessToken", access_token) \
.option("encrypt", "true") \
.option("hostNameInCertificate", "") \
.option("driver","")\
.save()
else:
chunk=2000000
id1 = 0
id2 = chunk
c =count
while id1 < c:
print("Insertion STARTED at : "+ str(datetime.datetime.now()))
stop_df = final_df.filter( (final_df.id_tmp < id2) & (final_df.id_tmp >= id1))
access_token,connection_string = Service_Principal()
df.write.format("jdbc") \
.mode("append") \
.option("url", jdbcUrl) \
.option("dbtable", table_name) \
.option("accessToken", access_token) \
.option("encrypt", "true") \
.option("hostNameInCertificate", "") \
.option("driver","")\
.save()
print("Insertion COMPLETED at : "+ str(datetime.datetime.now()))
id1+=chunk
id2+=chunk
How can we close the JDBC connection in each batch, or drop/refresh the SPN token for each batch?
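There is no driver-side JDBC connection to close here: the Spark JDBC writer opens and closes its own connections inside each .save() job, using whatever options are set at that moment, so a freshly generated accessToken per batch does take effect. As a hedged sketch only, one way to keep every batch short enough to fit inside the token window is to persist the source DataFrame once (an assumption on my part, to avoid recomputing the upstream query for every chunk) and fetch the token immediately before each write:
# Sketch, assuming Service_Principal() returns a fresh token on every call
# and final_df has an id_tmp column suitable for range slicing.
final_df = final_df.persist()
chunk = 2000000
id1, id2 = 0, chunk
while id1 < count:
    batch_df = final_df.filter((final_df.id_tmp >= id1) & (final_df.id_tmp < id2))
    access_token, _ = Service_Principal()   # new SPN token for this batch only
    batch_df.write.format("jdbc") \
        .mode("append") \
        .option("url", jdbcUrl) \
        .option("dbtable", table_name) \
        .option("accessToken", access_token) \
        .option("encrypt", "true") \
        .save()
    id1 += chunk
    id2 += chunk
final_df.unpersist()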

Is there a way to improve the run time or optimize Python / Pyspark code?

I am working on extracting table count information from Azure SQL Server for over 350 tables. As the system metadata tables are not regularly refreshed, I can't rely on them. I wrote the code below to achieve this:
import pyodbc
from pyspark.sql.types import *

pyodbc.pooling = False

def get_table_count(query, server, username, password, database):
    conn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};SERVER=' + server + ';DATABASE=' + database + ';UID=' + username + ';PWD=' + password)
    cursor = conn.cursor()
    cursor.execute(query)
    row = cursor.fetchone()
    columns = StructType([StructField('tableCount', LongType(), True), StructField('tableName', StringType(), True), StructField('databaseName', StringType(), True)])
    data = [(row[0], row[1], row[2])]
    df = spark.createDataFrame(data=data, schema=columns)
    cursor.close()
    del cursor
    conn.close()
    return df
import pyspark.sql.functions as F

dbList = [SQLServerDB1, SQLServerDB2]
SQLServerDB1_query = ""
SQLServerDB2_query = ""

for db in dbList:
    print("Currently loading for " + db + " database")
    serverName = db + "SQLServerName"
    serverUser = db + "SQLServerUser"
    serverPassword = db + "SQLServerPassword"
    serverDB = db + "SQLServerDB"
    tables = df.select('target_object').filter(F.col('source') == db).distinct().toPandas()['target_object']
    for tablename in list(tables):
        if tablename != list(tables)[-1]:
            vars()["%s_query" % db] = f" Select count_big(*) as tableCount, '{tablename}' as tableName, '{db}' as databaseName from " + f"{tablename} \n" + " union \n" + vars()["%s_query" % db]
        else:
            vars()["%s_query" % db] = vars()["%s_query" % db] + f" Select count_big(*) as tableCount, '{tablename}' as tableName, '{db}' as databaseName from " + f"{tablename}"
    vars()["%s_DF" % db] = get_table_count(vars()["%s_query" % db], eval(serverName), eval(serverUser), eval(serverPassword), eval(serverDB))
    # exec(f'{db}_DF = get_table_count( vars()["%s_query"%db] , eval(serverName), eval(serverUser), eval(serverPassword), eval(serverDB) )')
    # print(tablename + " Loaded")
I am getting the below error:
('42000', "[42000] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Parse error at line: 3, column: 1: Incorrect syntax near 'union'. (103010) (SQLExecDirectW)")
I tried printing the SQL statements, and they ran without any issue directly against the SQL Server DB.
Please suggest where I am writing the code incorrectly.
I tried the code below and it works. Thanks, guys, for the suggestions!
def get_table_count(query, server, username, password, database):
    jdbc_url = f"jdbc:sqlserver://{server}:1433;databaseName={database}"
    df_read = spark.read \
        .format("jdbc") \
        .option("url", jdbc_url) \
        .option("query", query) \
        .option("user", username) \
        .option("password", password) \
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
        .load()
    df_read.write.mode('overwrite').parquet('/tmp/' + f"{database}" + '.parquet')
    df = spark.read.parquet('/tmp/' + f"{database}" + '.parquet')
    return df
import pyspark.sql.functions as F

dbList = [SQLServerDB1, SQLServerDB2]
SQLServerDB1_query = ""
SQLServerDB2_query = ""

for db in dbList:
    print("Currently loading for " + db + " database")
    serverName = db + "SQLServerName"
    serverUser = db + "SQLServerUser"
    serverPassword = db + "SQLServerPassword"
    serverDB = db + "SQLServerDB"
    tables = df.select('target_object').filter(F.col('source') == db).distinct().toPandas()['target_object']
    for tablename in list(tables):
        if tablename != list(tables)[-1]:
            vars()["%s_query" % db] = f" Select count_big(1) as tableCount, '{tablename}' as tableName, '{db}' as databaseName from " + f"{tablename} \n" + " union \n" + vars()["%s_query" % db]
        else:
            vars()["%s_query" % db] = vars()["%s_query" % db] + f" Select count_big(1) as tableCount, '{tablename}' as tableName, '{db}' as databaseName from " + f"{tablename}"
    print(vars()["%s_query" % db])
    vars()["%s_DF" % db] = get_table_count(vars()["%s_query" % db], eval(serverName), eval(serverUser), eval(serverPassword), eval(serverDB))
    vars()["%s_DF" % db].createOrReplaceTempView(f"{db}_tablesCount")
    print(f"{db}" + " Loaded")

Read data from a Postgres DB at 1-hour intervals in PySpark

I want to read data from a Postgres DB for a 1-hour time interval, and I want the process to run every hour. How can I do that? I have attached my code snippet. I am unable to use readStream with the jdbc source.
df = spark.read \
    .format("jdbc") \
    .option("url", URL) \
    .option("dbtable", "tagpool_with_tag_raw") \
    .option("user", "tsdbadmin") \
    .option("password", "cgqu5qss2zy3i1") \
    .option("driver", "org.postgresql.Driver") \
    .load()

# Getting the current date and time
dt = datetime.datetime.now(timezone.utc)
utc_time = dt.replace(tzinfo=timezone.utc)
utc_timestamp = utc_time.timestamp()
epoch = round(utc_timestamp / 60) * 60
# epoch = epoch + 3600
print("epoch ", epoch)

df.createOrReplaceTempView("tagpool_with_tag_raw")
x = spark.sql("""select * from tagpool_with_tag_raw""")
x.show()

query = spark.sql("select * from tagpool_with_tag_raw WHERE input_time = " + str(epoch))  # .format()
# query = spark.sql("select CAST(input_time AS bigint), CAST(orig_time AS bigint) from tagpool_with_tag_raw WHERE input_time = " + str(epoch))
query.show()
# df.selectExpr("SELECT * FROM public.tagpool_raw WHERE input_time <= %s".format(epoch))
df.printSchema()

query.write \
    .format("jdbc") \
    .option("url", URL) \
    .option("dbtable", "tagpool_tag_raw") \
    .option("user", USER) \
    .option("password", PW) \
    .option("driver", DRIVER).save()
readStream is not for JDBC, as JDBC is a batch source. You will have to create a process just like the one you already have and use a scheduler such as AutoSys, Oozie, or whatever your enterprise uses to run it every hour.
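If it helps, a minimal sketch of the hourly batch job itself, pushing the 1-hour window down to Postgres through a subquery in the dbtable option so each scheduled run reads only the latest hour (assuming input_time holds epoch seconds, as the question's filter suggests):
# Sketch: read only the last complete hour from Postgres on each scheduled run.
import datetime
from datetime import timezone

now = datetime.datetime.now(timezone.utc).timestamp()
end_epoch = int(now // 3600) * 3600        # top of the current hour
start_epoch = end_epoch - 3600             # one hour earlier

hour_query = ("(select * from tagpool_with_tag_raw "
              f"where input_time >= {start_epoch} and input_time < {end_epoch}) t")

df_hour = spark.read \
    .format("jdbc") \
    .option("url", URL) \
    .option("dbtable", hour_query) \
    .option("user", "tsdbadmin") \
    .option("password", "cgqu5qss2zy3i1") \
    .option("driver", "org.postgresql.Driver") \
    .load()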

Trying to update or insert into SQL Server using pyodbc by iterating through a Pandas data frame

I am trying to update a record if it exists and insert if no record is found, using the code below:
for index, row in df.iterrows():
    cols = "],[".join([str(i) for i in df.columns.tolist()])
    cols = "([" + cols + "])"
    ucols = "] = ?,[".join([str(i) for i in df.columns.tolist()])
    ucols = "[" + ucols + "] = ?"
    c.execute("SET TRANSACTION ISOLATION LEVEL SERIALIZABLE;")
    c.execute("BEGIN TRANSACTION;")
    c.execute("UPDATE " + tblname + " SET " + ucols + " WHERE [TESTNUMBER]=" + str(row['TESTNUMBER']) + " AND [ROWNUM] =" + str(row['ROWNUM']) + ";", tuple(row))
    sqlr = "IF @@ROWCOUNT = 0 " \
           "BEGIN " \
           "INSERT INTO " + tblname + cols + " VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?); " \
           "END " \
           "COMMIT TRANSACTION;"
    c.execute(sqlr, tuple(row))
I am getting the below error message:
{ProgrammingError}('25000', u'[25000] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Transaction count after EXECUTE indicates a mismatching number of BEGIN and COMMIT statements. Previous count = 2, current count = 1. (266) (SQLExecDirectW)')
I'm not sure what I am doing wrong; I appreciate your help.
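Not a definitive fix, but one hedged way to avoid the mismatched BEGIN/COMMIT count is to keep the UPDATE and the conditional INSERT in a single batch and let pyodbc manage the transaction itself (autocommit is off by default, so conn.commit() ends it). tblname, cols, ucols and the cursor c are taken from the question; conn is assumed to be the pyodbc connection that created c:
# Sketch: one UPDATE + conditional INSERT batch per row, committed via pyodbc.
placeholders = ",".join(["?"] * len(df.columns))
upsert_sql = (
    "UPDATE " + tblname + " SET " + ucols +
    " WHERE [TESTNUMBER] = ? AND [ROWNUM] = ?; "
    "IF @@ROWCOUNT = 0 "
    "BEGIN "
    "    INSERT INTO " + tblname + cols + " VALUES (" + placeholders + "); "
    "END"
)

for index, row in df.iterrows():
    params = tuple(row) + (row['TESTNUMBER'], row['ROWNUM']) + tuple(row)
    c.execute(upsert_sql, params)
conn.commit()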

Query Spotlight for a range of dates via PyObjC

I'm using Spotlight via PyObjC, which is working well except when I try to limit the time period using kMDItemContentCreationDate. I think the issue is with my time format. Any help would be greatly appreciated.
from Cocoa import *
import sys

emails = [sys.argv[1], ]
predicate = "(kMDItemContentType = 'com.apple.mail.emlx') && (" + \
    '||'.join(["((kMDItemAuthorEmailAddresses = '%s'))" % m for m in emails]) + \
    "&& (kMDItemContentCreationDate > '2011-03-23 00:00:00')" + \
    "&& (kMDItemContentCreationDate < '2012-03-24 00:00:00')" + \
    ")"

query = NSMetadataQuery.alloc().init()
query.setPredicate_(NSPredicate.predicateWithFormat_(predicate))
query.setSortDescriptors_(NSArray.arrayWithObject_(NSSortDescriptor.alloc().initWithKey_ascending_('kMDItemContentCreationDate', False)))
query.startQuery()
NSRunLoop.currentRunLoop().runUntilDate_(NSDate.dateWithTimeIntervalSinceNow_(5))
query.stopQuery()

results = query.results()[:5]
for item in results:
    print "subject: ", item.valueForAttribute_("kMDItemSubject")
