I am working on extracting table count information from Azure SQL Server for more than 350 tables. Since the system metadata tables are not regularly refreshed, I can't rely on them. I wrote the code below to achieve this:
import pyodbc
from pyspark.sql.types import *

pyodbc.pooling = False

def get_table_count(query, server, username, password, database):
    conn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};SERVER=' + server + ';DATABASE=' + database + ';UID=' + username + ';PWD=' + password)
    cursor = conn.cursor()
    cursor.execute(query)
    row = cursor.fetchone()
    columns = StructType([StructField('tableCount', LongType(), True), StructField('tableName', StringType(), True), StructField('databaseName', StringType(), True)])
    data = [(row[0], row[1], row[2])]
    df = spark.createDataFrame(data=data, schema=columns)
    cursor.close()
    del cursor
    conn.close()
    return df
import pyspark.sql.functions as F

dbList = ["SQLServerDB1", "SQLServerDB2"]
SQLServerDB1_query = ""
SQLServerDB2_query = ""

for db in dbList:
    print("Currently loading for " + db + " database")
    serverName = db + "SQLServerName"
    serverUser = db + "SQLServerUser"
    serverPassword = db + "SQLServerPassword"
    serverDB = db + "SQLServerDB"
    # df is an existing DataFrame (defined elsewhere) that maps each source database to its target_object tables
    tables = df.select('target_object').filter(F.col('source') == db).distinct().toPandas()['target_object']
    for tablename in list(tables):
        if tablename != list(tables)[-1]:
            vars()["%s_query" % db] = f" Select count_big(*) as tableCount, '{tablename}' as tableName, '{db}' as databaseName from " + f"{tablename} \n" + " union \n" + vars()["%s_query" % db]
        else:
            vars()["%s_query" % db] = vars()["%s_query" % db] + f" Select count_big(*) as tableCount, '{tablename}' as tableName, '{db}' as databaseName from " + f"{tablename}"
    vars()["%s_DF" % db] = get_table_count(vars()["%s_query" % db], eval(serverName), eval(serverUser), eval(serverPassword), eval(serverDB))
    # exec(f'{db}_DF = get_table_count( vars()["%s_query"%db] , eval(serverName), eval(serverUser), eval(serverPassword), eval(serverDB) )')
    # print(tablename + " Loaded")
I am getting the below error:
('42000', "[42000] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Parse error at line: 3, column: 1: Incorrect syntax near 'union'. (103010) (SQLExecDirectW)")
I tried printing the SQL statements and they ran without any issue directly against the SQL Server DB.
Please suggest where my code is going wrong.
I tried the code below and it works. Thanks, everyone, for the suggestions!
def get_table_count(query, server, username, password, database):
    jdbc_url = f"jdbc:sqlserver://{server}:1433;databaseName={database}"
    df_read = spark.read \
        .format("jdbc") \
        .option("url", jdbc_url) \
        .option("query", query) \
        .option("user", username) \
        .option("password", password) \
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
        .load()
    # Write the result to parquet and read it back so later actions use the parquet copy
    # rather than re-running the JDBC query
    df_read.write.mode('overwrite').parquet('/tmp/' + f"{database}" + '.parquet')
    df = spark.read.parquet('/tmp/' + f"{database}" + '.parquet')
    return df
import pyspark.sql.functions as F

dbList = ["SQLServerDB1", "SQLServerDB2"]
SQLServerDB1_query = ""
SQLServerDB2_query = ""

for db in dbList:
    print("Currently loading for " + db + " database")
    serverName = db + "SQLServerName"
    serverUser = db + "SQLServerUser"
    serverPassword = db + "SQLServerPassword"
    serverDB = db + "SQLServerDB"
    tables = df.select('target_object').filter(F.col('source') == db).distinct().toPandas()['target_object']
    for tablename in list(tables):
        if tablename != list(tables)[-1]:
            vars()["%s_query" % db] = f" Select count_big(1) as tableCount, '{tablename}' as tableName, '{db}' as databaseName from " + f"{tablename} \n" + " union \n" + vars()["%s_query" % db]
        else:
            vars()["%s_query" % db] = vars()["%s_query" % db] + f" Select count_big(1) as tableCount, '{tablename}' as tableName, '{db}' as databaseName from " + f"{tablename}"
    print(vars()["%s_query" % db])
    vars()["%s_DF" % db] = get_table_count(vars()["%s_query" % db], eval(serverName), eval(serverUser), eval(serverPassword), eval(serverDB))
    vars()["%s_DF" % db].createOrReplaceTempView(f"{db}_tablesCount")
    print(f"{db}" + " Loaded")
I want to read data from a Postgres DB in one-hour intervals, and I want the process to run every hour. How can I do that? I have attached my code snippet. I am unable to use readStream with the JDBC source.
import datetime
from datetime import timezone

df = spark.read \
    .format("jdbc") \
    .option("url", URL) \
    .option("dbtable", "tagpool_with_tag_raw") \
    .option("user", "tsdbadmin") \
    .option("password", "cgqu5qss2zy3i1") \
    .option("driver", "org.postgresql.Driver") \
    .load()

# Get the current UTC time as an epoch timestamp, rounded to the nearest minute
dt = datetime.datetime.now(timezone.utc)
utc_time = dt.replace(tzinfo=timezone.utc)
utc_timestamp = utc_time.timestamp()
epoch = round(utc_timestamp / 60) * 60
# epoch = epoch + 3600
print("epoch ", epoch)

df.createOrReplaceTempView("tagpool_with_tag_raw")
x = spark.sql("""select * from tagpool_with_tag_raw""")
x.show()

query = spark.sql("select * from tagpool_with_tag_raw WHERE input_time = " + str(epoch))  # .format()
# query = spark.sql("select CAST(input_time AS bigint), CAST(orig_time AS bigint) from tagpool_with_tag_raw WHERE input_time = " + str(epoch))  # .format()
query.show()
# df.selectExpr(("SELECT * FROM public.tagpool_raw WHERE input_time<= %s".format(epoch)))
df.printSchema()

query.write \
    .format("jdbc") \
    .option("url", URL) \
    .option("dbtable", "tagpool_tag_raw") \
    .option("user", USER) \
    .option("password", PW) \
    .option("driver", DRIVER).save()
readStream is not available for JDBC, as JDBC is a batch source. You will have to create a process just like the one you already have and use a scheduler such as AutoSys, Oozie, or whatever your enterprise uses to run it every hour.
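If it helps, here is a minimal sketch of what the hourly job could look like. It assumes the same connection details (URL, USER, PW) as in your snippet and that input_time holds epoch seconds; the filter is pushed down through the JDBC query option so each run only reads the previous hour, and the external scheduler simply invokes the script once an hour.

import datetime
from datetime import timezone

# Window of the previous full hour, in epoch seconds (assumes input_time is epoch seconds)
now = datetime.datetime.now(timezone.utc).timestamp()
end_epoch = int(now // 3600) * 3600   # start of the current hour
start_epoch = end_epoch - 3600        # start of the previous hour

# Push the time filter down to Postgres so only one hour of rows is transferred
hourly_query = ("select * from tagpool_with_tag_raw "
                "where input_time >= {} and input_time < {}").format(start_epoch, end_epoch)

df_hour = spark.read \
    .format("jdbc") \
    .option("url", URL) \
    .option("query", hourly_query) \
    .option("user", USER) \
    .option("password", PW) \
    .option("driver", "org.postgresql.Driver") \
    .load()

df_hour.write \
    .format("jdbc") \
    .option("url", URL) \
    .option("dbtable", "tagpool_tag_raw") \
    .option("user", USER) \
    .option("password", PW) \
    .option("driver", "org.postgresql.Driver") \
    .mode("append") \
    .save()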
Hello, I'm trying to copy an Oracle partitioned table to a Datalake parquet file, using this script:
# Convert it to a Spark SQL table and save it in parquet format
df.write \
    .format("parquet") \
    .option("path", "/archive/" + schema_name + "/" + table_name + ".parquet") \
    .mode("append") \
    .saveAsTable(table_name)
This code gets all the data in the table, not a single partition.
spark = SparkSession.builder \
    .appName("Load " + schema_name + " " + table_name + " from Oracle into Parquet and creating Table") \
    .getOrCreate()
This one creates the table from Oracle.
How can I get only the parquet file? :)
You said that when you select from the table you get data from the whole table, but you want data from a particular partition. Did you try specifying the partition name using the syntax PARTITION(partition_name)?
How many partitions do you have? If there are not too many, you can try creating a view for each partition and then selecting data from the view.
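As a minimal sketch of what that could look like with the Spark JDBC reader, assuming a hypothetical partition name P1 and reusing the connection variables that appear in your snippets (db_host, db_port, db_service, db_user, db_pass), you can wrap the partition-scoped select in a subquery and pass it as dbtable:

# P1 is a hypothetical partition name; schema_name and table_name are the ones from your script
partition_query = "(select * from " + schema_name + "." + table_name + " partition (P1)) t"

df_part = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:oracle:thin:@" + db_host + ":" + db_port + "/" + db_service) \
    .option("dbtable", partition_query) \
    .option("user", db_user) \
    .option("password", db_pass) \
    .option("driver", "oracle.jdbc.OracleDriver") \
    .load()

From there, df_part.write.mode("append").parquet("/archive/" + schema_name + "/" + table_name + ".parquet") writes just the parquet files without registering a table in the metastore, which saveAsTable does.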
I created a table in Oracle named checkes and added the partition names to it; after that I can read the partition name from Spark.
query = '(select partition_name from Schema.checkes c) checkes'
df = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:oracle:thin:@" + db_host + ":" + db_port + "/" + db_service) \
    .option("dbtable", query) \
    .option("user", db_user) \
    .option("password", db_pass) \
    .option("driver", "oracle.jdbc.OracleDriver") \
    .option("encoding", "UTF-8") \
    .option("fetchSize", 10000) \
    .option("numPartitions", 40) \
    .load()

print("part count: " + str(df.count()))
if df.count() > 0:
    partition_name = df.select("partition_name").collect()[0]["partition_name"]
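# query1 is not shown in this snippet; presumably it scopes the select to the partition
# just looked up. A hypothetical example (the partitioned table name Schema.mytable is a placeholder):
query1 = '(select * from Schema.mytable partition (' + partition_name + ')) mytable_part'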
df1 = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:oracle:thin:@" + db_host + ":" + db_port + "/" + db_service) \
    .option("dbtable", query1) \
    .option("user", db_user) \
    .option("password", db_pass) \
    .option("driver", "oracle.jdbc.OracleDriver") \
    .option("encoding", "UTF-8") \
    .option("fetchSize", 10000) \
    .option("numPartitions", 40) \
    .load()
I keep getting the error "ValueError: too many values to unpack (expected 12)". I've checked the dataset and counted all the variables at least 10 times, and I'm banging my head against the wall... What am I doing wrong?
Code (Python 3.7):
class Corona(object):
    def __init__(self, FIPS, County, State, Country, Updated, Latitude, Longitude,
                 Confirmed, Deaths, Recovered, Active, Combined_Key):
        self.FIPS = FIPS
        self.County = County
        self.State = State
        self.Country = Country
        self.Updated = Updated
        self.Latitude = Latitude
        self.Longitude = Longitude
        self.Confirmed = Confirmed
        self.Deaths = Deaths
        self.Recovered = Recovered
        self.Active = Active
        self.Combined_Key = Combined_Key

    def __str__(self):
        return self.FIPS + "/" + \
               self.County + "/" + \
               self.State + "/" + \
               self.Country + "/" + \
               self.Updated + "/" + \
               self.Latitude + "/" + \
               self.Longitude + "/" + \
               self.Confirmed + "/" + \
               self.Deaths + "/" + \
               self.Recovered + "/" + \
               self.Active + "/" + \
               self.Combined_Key


def loadPeople(filename, slist):
    ifile = open(filename, 'r')
    for line in ifile:
        FIPS, \
        County, \
        State, \
        Country, \
        Updated, \
        Latitude, \
        Longitude, \
        Confirmed, \
        Deaths, \
        Recovered, \
        Active, \
        Combined_Key, \
        = line.split(',')
        s = Corona(FIPS, County, State, Country, Updated, Latitude, Longitude,
                   Confirmed, Deaths, Recovered, Active, Combined_Key)
        slist.append(s)


def main():
    filename = "CoronaVirus.txt"
    sick = []
    loadPeople(filename, sick)


if __name__ == ("__main__"):
    main()
First 3 lines of the dataset:
45001,Abbeville,South Carolina,US,2020-04-01 21:58:49,34.22333378,-82.46170658,4,0,0,0,"Abbeville, South Carolina, US"
22001,Acadia,Louisiana,US,2020-04-01 21:58:49,30.295064899999996,-92.41419698,47,1,0,0,"Acadia, Louisiana, US"
51001,Accomack,Virginia,US,2020-04-01 21:58:49,37.76707161,-75.63234615,7,0,0,0,"Accomack, Virginia, US"
You have commas inside some of your quoted fields, so those fields are also being split on those commas:
s = '45001,Abbeville,South Carolina,US,2020-04-01 21:58:49,34.22333378,-82.46170658,4,0,0,0,"Abbeville, South Carolina, US"'
len(s.split(','))
# 14
Instead of manually splitting the lines, use the csv module:
import csv
with open(filename) as ifile:
    reader = csv.reader(ifile)
    for line in reader:  # line is a list
        slist.append(Corona(*line))
Actually, from your example, line.split(',') is not the right thing: it splits into 14 items, not 12. Splitting this way also splits inside the double-quoted field. When you split the following line like this, the length is 14, which is not what you want.
45001,Abbeville,South Carolina,US,2020-04-01 21:58:49,34.22333378,-82.46170658,4,0,0,0,"Abbeville, South Carolina, US"
You can modify the definition of loadPeople as follows:
def loadPeople(filename, slist):
    ifile = open(filename, 'r')
    for line in ifile:
        FIPS, \
        County, \
        State, \
        Country, \
        Updated, \
        Latitude, \
        Longitude, \
        Confirmed, \
        Deaths, \
        Recovered, \
        Active, \
        Combined_Key, \
        = line.split('"')[0].split(',')[:-1] + [line.split('"')[1]]
        s = Corona(FIPS, County, State, Country, Updated, Latitude, Longitude,
                   Confirmed, Deaths, Recovered, Active, Combined_Key)
        slist.append(s)
Simple problem, but it might be hard to see. When you split the line by commas, you also split the last item, which holds the full location, into three parts. Therefore you get two extra values: "Abbeville, South Carolina, US" becomes three items. Try:
for line in ifile:
    FIPS, \
    County, \
    State, \
    Country, \
    Updated, \
    Latitude, \
    Longitude, \
    Confirmed, \
    Deaths, \
    Recovered, \
    Active, \
    Combined_Key1, \
    Combined_Key2, \
    Combined_Key3 \
    = line.split(',')
And then just:
Combined_Key = ",".join(Combined_Key1, Combined_Key2, Combnied_Key3)
Though, since the combined key seems to be just the County, State, and Country joined together, you could also try using those three joined as your combined key. Be warned, though: if any of your lines has a differently formatted combined key, you will need a more sophisticated approach.
Additionally, if you are sure this file is safe to parse, you could just use the csv module to get a list of values, though it takes a bit more time to learn; a sketch of what that could look like is below.
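For instance, a minimal sketch of loadPeople using csv.reader (same field order as the Corona constructor); the reader honours the quotes, so the quoted "Abbeville, South Carolina, US" field comes back as a single value:

import csv

def loadPeople(filename, slist):
    with open(filename, newline='') as ifile:
        reader = csv.reader(ifile)
        for row in reader:
            # row is a list of 12 fields; the quoted combined key stays intact
            slist.append(Corona(*row))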