I would like to use the date as an identifier that links two different tables together. I've searched a lot and tried a few alternative solutions, but I keep getting error messages like this:
pyodbc.Error: ('21S01', '[21S01] [Microsoft][ODBC SQL Server Driver][SQL Server]There are more columns in the INSERT statement than values specified in the VALUES clause. The number of values in the VALUES clause must match the number of columns specified in the INSERT statement. (109) (SQLExecDirectW)')
This is the code I work with:
from src.server.connectToDB import get_sql_conn
import pandas as pd
from datetime import datetime
if __name__ == '__main__':
    cursor = get_sql_conn().cursor()
    localFile = 'C:\\Users\\dersimw\\Source\\Repos\\nordpoolAnalyse\\data\\2011-3.xlsx'
    excelFile = pd.ExcelFile(localFile)
    rowsID = []
    a = ["01"]
    for sheets in a:
        df = excelFile.parse(sheets).head(5)
        df.dropna(axis=1, how='all', inplace=True)
        df.fillna(0, inplace=True)
        print(df)

    now = datetime.now()
    DateDateTime = now.strftime('%Y-%m-%d %H:%M:%S')

    for key, rows in df.items():
        print("## Column: ", key, "\n")
        columnInsertSql = "INSERT INTO Table11 (DateDateTime, AcceptedBlockBuy, AcceptedBlockSell, RejectedBlockBuy, RejectedBlockSell, NetImports) VALUES("
        columnCounter = 1
        columnHasData = False
        for key, column in rows.items():
            if isinstance(column, int) or isinstance(column, float):
                columnHasData = True
                columnInsertSql += str(column)
                if columnCounter != len(list(rows.items())):
                    columnInsertSql += ", "
            columnCounter += 1
        columnInsertSql += ")"
        if columnHasData == True:
            cursor.execute(columnInsertSql)
            cursor.commit()
This is what I have:
Id   A.BlockBuy  A.BlockSell  R.BlockBuy  R.BlockSell  NetImports
1    112         1            14          655          65
2    123         1            54          3            654
3    122         1            65          43           43
.
.
122  21          12           54          54           54
This is what I want:
Id   DateDate    A.BlockBuy  A.BlockSell  R.BlockBuy  R.BlockSell  NetImports
1    2018-08-01  112         1            14          655          65
2    2018-08-01  123         1            54          3            654
3    2018-08-01  122         1            65          43           43
.
.
122  2018-08-01  21          12           54          54           54
The way you are trying to do this is not a good way to do ETL. I have built my own package for one of my projects using Postgres and Python, and the procedure should be exactly the same for SQL Server. You should add a datetime column (etl_run_time) to your data. I always add it to the dataframe before I upload the data to the database; then I can do a bulk insert.
The main thing is that loading the data into Python and inserting it into the database should be separate tasks, followed by an update task if needed. I could not find the time to replicate your task exactly, but you can read about it in detail in this blog post: https://datapsycho.github.io/PsychoBlog/dataparrot-18-01
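Before the full package code below, here is a minimal sketch of just that idea for the question's pyodbc/SQL Server setup: add the run-time column to the dataframe first, then do one parameterized bulk insert with executemany. The table and column names are taken from the question; the connection string, the file path and the assumption that the dataframe columns already match the table are illustrative only:

from datetime import datetime
import pandas as pd
import pyodbc

conn = pyodbc.connect("DSN=MyDsn")  # hypothetical DSN; use your own connection
cursor = conn.cursor()

df = pd.read_excel(r"C:\path\to\2011-3.xlsx", sheet_name="01")
df.dropna(axis=1, how='all', inplace=True)
df.fillna(0, inplace=True)

# add the datetime column to the data before loading it
df['DateDateTime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# one parameterized bulk insert; pyodbc uses ? placeholders
insert_sql = ("INSERT INTO Table11 "
              "(DateDateTime, AcceptedBlockBuy, AcceptedBlockSell, "
              "RejectedBlockBuy, RejectedBlockSell, NetImports) "
              "VALUES (?, ?, ?, ?, ?, ?)")
rows = df[['DateDateTime', 'AcceptedBlockBuy', 'AcceptedBlockSell',
           'RejectedBlockBuy', 'RejectedBlockSell', 'NetImports']].values.tolist()
cursor.executemany(insert_sql, rows)
conn.commit()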
# import datetime
import time
# from dateutil.relativedelta import relativedelta
import json
# this function holds the username and password for the db connection
# you can write your own; it returns a connection object used to create a cursor
from auths.auth import db_connection
import os
import pandas as pd
class DataLoader():
    # function to process survey sent data
    @classmethod
    def process_survey_sent_data(cls):
        # my file path is very long so I split it into 3 different parts
        input_path_1 = r'path\to\your\file'
        input_path_2 = r'\path\to\your\file'
        input_path_3 = r'\path\to\your\file'
        file_path = input_path_1 + input_path_2 + input_path_3
        file_list = os.listdir(os.path.join(file_path))
        file_list = [file_name for file_name in file_list if '.txt' in file_name]
        field_names = ['country', 'ticket_id']
        pd_list = []
        for file_name in file_list:
            # collect the file name to use it as a column value
            date_ = file_name.replace(" ", "-")[:-4]
            file_path_ = file_path + '\\' + file_name
            df_ = pd.read_csv(os.path.join(file_path_), sep='\t', usecols=field_names).assign(sent_date=date_)
            df_['sent_date'] = pd.to_datetime(df_['sent_date'])
            df_['sent_date'] = df_['sent_date'].values.astype('datetime64[M]')
            df_['sent_date'] = df_['sent_date'].astype(str)
            pd_list.append(df_)
        df_ = pd.concat(pd_list)
        # a few more cleaning steps
        # create a unique ID
        df_ = df_[['country', 'sent_date', 'ticket_id']].groupby(['country', 'sent_date']).agg('count').reset_index()
        df_['sent_id'] = df_['country'] + '_' + df_['sent_date']
        df_.drop_duplicates(keep='first', subset='sent_id')
        print(df_.head())
        output_path_1 = r'\\long\output\path1'
        output_path_2 = r'\lont\output\path2'
        output_path = output_path_1 + output_path_2
        # put the file name
        survey_sent_file = 'survey_sent.json'
        # add etl run time
        df_['etl_run_time'] = pd.to_datetime('today').strftime('%Y-%m-%d')
        # write file to json
        df_.to_json(os.path.join(output_path, survey_sent_file), orient='records')
        return print('Survey Sent data stored as json dump')

    # function to create a database insert query
    @classmethod
    def insert_string(cls, column_list, table_name):
        # you can print first_part in the console to inspect the generated SQL
        first_part = 'INSERT INTO {} VALUES ('.format(table_name)
        second_part = ', '.join(['%({})s'.format(col) for col in column_list])
        return first_part + second_part + ') ;'

    # function to execute a database query
    @classmethod
    def empty_table(cls, table_name):
        conn = db_connection()
        cursor = conn.cursor()
        cursor.execute("delete from {} ;".format(table_name))
        conn.commit()
        conn.close()

    # # function to run post_sql code after the data load
    # @classmethod
    # def run_post_sql(cls):
    #     # a database query which can run after the insertion of data
    #     post_sql = """
    #     INSERT INTO schema.target_table -- target
    #     select * from schema.extract_table -- extract
    #     WHERE
    #         schema.extract_table.sent_id -- primary key of extract
    #         NOT IN (SELECT DISTINCT sent_id FROM schema.target_table) -- not in target
    #     """
    #     conn = db_connection()
    #     cursor = conn.cursor()
    #     cursor.execute(post_sql)
    #     conn.commit()
    #     conn.close()
    #     return print("Post SQL has run for Survey Sent.")

    # function to insert data into the server
    @classmethod
    def insert_survey_sent_data(cls):
        output_path_1 = r'new\json\file\path1'
        output_path_2 = r'\new\json\file\path2'
        output_path = output_path_1 + output_path_2
        # the json file created above
        output_survey_file = 'survey_sent.json'
        full_path = os.path.join(output_path, output_survey_file)
        # column names from the json file
        table_def = ['sent_id', 'country', 'ticket_id', 'sent_date', 'etl_run_time']
        # load the data as json and partition it into chunks
        with open(full_path, 'r') as file:
            chunk = 60
            json_data = json.loads(file.read())
            json_data = [json_data[i * chunk:(i + 1) * chunk] for i in range((len(json_data) + chunk - 1) // chunk)]
        # create a connection, delete the existing data and insert the new data
        table_name = 'schema.extract_table'
        cls.empty_table(table_name)
        print('{} total chunks will be inserted, each chunk has {} rows.'.format(len(json_data), chunk))
        for iteration, chunk_ in enumerate(json_data, 1):
            conn = db_connection()
            cursor = conn.cursor()
            insert_statement = cls.insert_string(table_def, table_name)
            start_time = time.time()
            cursor.executemany(insert_statement, chunk_)
            conn.commit()
            conn.close()
            print(iteration, " %s seconds" % round((time.time() - start_time), 2))
        return print('Insert happened for survey sent.')

if __name__ == "__main__":
    DataLoader.process_survey_sent_data()
    DataLoader.insert_survey_sent_data()
    # DataLoader.run_post_sql()
Related
I have this problem: I am measuring the temperature of my room with an Arduino over a serial connection.
I also have a database of plants in Azure, listing the temperatures in which each plant can grow.
I want to check whether the temperature in my room is OK for the plants in my database to grow, but I don't know how to send this temperature value to Azure and check the temperature column.
Any advice?
Thanks
At the moment I'm thinking of having two files, one in Azure (I don't know how to upload it to Azure yet) and one on my local PC.
The one in the cloud:
from arduinodb_pc import *
import pyodbc
import time

server = '...'
database = '...'
username = '...'
password = '...'
driver = '{ODBC Driver 17 for SQL Server}'

conn = pyodbc.connect('DRIVER='+driver+';SERVER=tcp:'+server+';PORT=1433;DATABASE='+database+';UID='+username+';PWD='+password)
cursor = conn.cursor()

def db_check_temperature_cold(farenh):
    cursor.execute(f"SELECT * FROM my_plants WHERE minTinF >= {farenh};")  # send query and execute
    # row = [id, Name, ..., maxTinF]
    row = cursor.fetchone()  # move to the next row
    list_of_names = []
    while row:
        list_of_names.append(row)
        row = cursor.fetchone()  # move to the next row
    return list_of_names

def db_check_temperature_hot(farenh):
    cursor.execute(f"SELECT * FROM my_plants WHERE maxTinF <= {farenh};")  # send query and execute
    # row = [id, Name, ..., maxTinF]
    row = cursor.fetchone()  # move to the next row
    list_of_names = []
    while row:
        list_of_names.append(row)
        row = cursor.fetchone()  # move to the next row
    return list_of_names

currentTinF = 0
currentTinC = 0
text_currentT = 0

while True:
    currentTinF, currentTinC, text_currentT = update_T()
    too_cold = db_check_temperature_cold(currentTinF)
    too_hot = db_check_temperature_hot(currentTinF)
    if (too_cold): arduino_write_cold(arduino)
    if (too_hot): arduino_write_hot(arduino)
    time.sleep(15)  # but every 10 minutes
The code on my local PC:
import datetime
import time
from arduinodb_cloud import *
import serial

arduino = serial.Serial(port='COM3', baudrate=115200, timeout=.1)

def update_T():
    """
    Updates the label where the time and temperature are shown
    """
    return arduino_read_T(arduino, -1)

def arduino_read_T(arduino, unit):
    """
    In this function, the current temperature will be read from the Arduino.
    :param arduino: HW it is going to use for checking the T
    :param unit: If the user prefers ºF or ºC
    :return: f, c: current temperature in Fahrenheit and Celsius
             txt_currentT: Text that will be shown on the GUI
    """
    global fbefore
    global cbefore
    x = datetime.datetime.now()
    arduino.write(bytes("readT", 'utf-8'))
    time.sleep(0.05)
    value = arduino.readline()
    try:
        value = str(value).lstrip("b'")
        value = str(value).rstrip("'")
        f, c = str(value).split(",")
    except:
        try:
            f = fbefore
            c = cbefore
        except:
            f = 65.3
            c = 18.5
    fbefore = f
    cbefore = c
    if unit == 1: txt_currentT = (f"Today, {x.day}/{x.month}/{x.year} at {x.hour}:{x.minute}:{x.second} it's {f}ºF")
    elif unit == 0: txt_currentT = (f"Today, {x.day}/{x.month}/{x.year} at {x.hour}:{x.minute}:{x.second} it's {c}ºC")
    else: txt_currentT = ""
    return float(f), float(c), txt_currentT

def arduino_write_cold(arduino):
    arduino.write(bytes("cold", 'utf-8'))

def arduino_write_hot(arduino):
    arduino.write(bytes("hot", 'utf-8'))
I have a program I use to pull survey data from my site. It is written in Python and takes the info from the SQL database and downloads it into a CSV file. It worked great, but out of nowhere it now produces just a blank CSV file with no information and zero size. Below is the program I use for the pull. I am on Python 3.7.4.
import pymysql.cursors
import csv
import sshtunnel
from sshtunnel import SSHTunnelForwarder

server = SSHTunnelForwarder(
    ('ADDRESS', PORT),
    ssh_username='USERNAME',
    ssh_password='PASS',
    remote_bind_address=('127.0.0.1', 3306)
)
server.start()

def find_number(result, number):
    if str(number) in result['p4'].split(','):
        return 1
    if str(number) in result['p3'].split(','):
        return 2
    if str(number) in result['p2'].split(','):
        return 3
    if str(number) in result['p1'].split(','):
        return 4
    if str(number) in result['z'].split(','):
        return 5
    if str(number) in result['n1'].split(','):
        return 6
    if str(number) in result['n2'].split(','):
        return 7
    if str(number) in result['n3'].split(','):
        return 8
    if str(number) in result['n4'].split(','):
        return 9
    return 'not found: ' + str(number)

# Connect to the database
conn = pymysql.connect(host='127.0.0.1',
                       user='USER',
                       password='PASS',
                       db='DBNAME',
                       charset='utf8mb4',
                       port=server.local_bind_port,
                       cursorclass=pymysql.cursors.DictCursor)
cursor = conn.cursor()

sql = "SELECT * FROM `surveys` order by id asc"
cursor.execute(sql)
results = cursor.fetchall()

output = {}
headers = []
comments = []
for result in results:
    headers.append(str(result['gender']) + '/' + str(result['birth_year']) + '/' + str(result['political_affiliation']) + '/' + str(result['religious_belief']) + '/' + str(result['ethnicity']) + '/' + str(result['location']))
    for nr in range(1, 37):
        if not nr in output:
            output[nr] = []
        nrr = find_number(result, nr)
        output[nr].append(nrr)
    comments.append(result['comments'])

writer = csv.writer(open("output5.csv", 'w', newline=''))
writer.writerow(headers)
for row in output:
    writer.writerow(output[row])
writer.writerow(comments)
conn.close()
It could be that results is returning a falsy value, causing your for loop to be skipped. Have you tried adding a check for the case where there aren't any results?
...
results = cursor.fetchall()
if not results:
    raise RuntimeError('Cannot retrieve records from Database!')
Also, not sure if you know of defaultdict, but you could also use it in your snippet to simplify things a bit.
from collections import defaultdict
...
output = defaultdict(list)
for result in results:
    ...
    for nr in range(1, 37):
        output[nr].append(find_number(result, nr))
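To make the pattern concrete, here is a self-contained sketch: the in-memory results list and the compact find_number below are stand-ins for the real fetchall() output and the original function, included only so the snippet runs on its own:

from collections import defaultdict

# stand-in for cursor.fetchall() with a DictCursor; the real code gets this from the database
results = [
    {'p4': '1,5', 'p3': '2', 'p2': '', 'p1': '', 'z': '3',
     'n1': '', 'n2': '', 'n3': '', 'n4': ''},
]

# compact equivalent of the question's find_number(), kept only to make the example runnable
def find_number(result, number):
    for score, key in enumerate(['p4', 'p3', 'p2', 'p1', 'z', 'n1', 'n2', 'n3', 'n4'], start=1):
        if str(number) in result[key].split(','):
            return score
    return 'not found: ' + str(number)

output = defaultdict(list)  # missing keys start out as empty lists automatically
for result in results:
    for nr in range(1, 37):
        output[nr].append(find_number(result, nr))

print(output[1], output[2], output[3])  # [1] [2] [5]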
I have been told I need to use a mapping table instead of the hardcoded dictionary I made. Surely there is a better method than the code below for getting a 3-column table into a dictionary?
Mapping table
AgentGrp          Property     TimeZone
HollyWoodAgent    Sunset       PST
UnionSquareAgent  UnionSquare  PST
Turns into the following dictionary:
{'HollyWoodAgent': ['Sunset', 'PST'], 'UnionSquareAgent': ['UnionSquare', 'PST']}
Code:
import pandas as pd
import pyodbc
import datetime
import sys
import csv

VipAgent = "{"
finalSql = "SELECT agentgrp, property, timezone FROM sandbox_dev.agentgrp_map;"
colcnt = 0

try:
    conn = pyodbc.connect("DSN=Dev")
    cursor = conn.cursor()
    cursor.execute(finalSql)
    for row in cursor.fetchall():
        VipAgent += "'" + row.prop + "VipAgent':['" + row.prop + "','" + row.tz + "'],"
        colcnt = colcnt + 1
        if colcnt == 3:
            VipAgent = VipAgent + "\n"
            colcnt = 0
except pyodbc.Error as e:
    print(e)

VipAgent = VipAgent[:-1] + "}"
Dict = eval(VipAgent)
print(Dict)
I do get the values as expected, but there has to be a more Pythonic way of doing this.
We'll take it as given that you read the "mapping table" from a file into a Python list similar to this one:
item_map = ['AgentGrp', 'Property', 'TimeZone']
Once you've executed your SELECT query
cursor.execute(finalSql)
then you can build your dict like so:
result_dict = {}
while True:
    row = cursor.fetchone()
    if row:
        result_dict[row.__getattribute__(item_map[0])] = \
            [row.__getattribute__(x) for x in item_map[1:]]
    else:
        break
print(result_dict)
# {'HollyWoodAgent': ['Sunset', 'PST'], 'UnionSquareAgent': ['UnionSquare', 'PST']}
The trick is to use row.__getattribute__, e.g., row.__getattribute__('column_name') instead of hard-coding row.column_name.
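An equivalent and slightly more idiomatic spelling uses the built-in getattr() instead of calling __getattribute__ directly. A short sketch, assuming the same item_map and an open cursor as above:

result_dict = {}
for row in cursor.fetchall():
    # getattr(row, 'column_name') looks up an attribute by name at runtime
    result_dict[getattr(row, item_map[0])] = [getattr(row, col) for col in item_map[1:]]
print(result_dict)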
My use case is to create a temp table in the Postgres database, fetch records from it, and insert them into a different table.
The code I used is:
from __future__ import print_function  # __future__ imports must come first
import psycopg2
import sys
import pprint
from os.path import join, dirname, abspath
import xlrd
import os.path

newlist = []
itemidlist = []

def main():
    conn_string = "host='prod-dump.cvv9i14mrv4k.us-east-1.rds.amazonaws.com' dbname='ebdb' user='ebroot' password='*********'"
    # print the connection string we will use to connect
    # print "Connecting to database" % (conn_string)
    # get a connection; if a connection cannot be made an exception will be raised here
    conn = psycopg2.connect(conn_string)
    # conn.cursor will return a cursor object, you can use this cursor to perform queries
    cursor = conn.cursor()
    dealer_id = input("Please enter dealer_id: ")
    group_id = input("Please enter group_id: ")

    scriptpath = os.path.dirname('__file__')
    filename = os.path.join(scriptpath, 'Winco - Gusti.xlsx')
    xl_workbook = xlrd.open_workbook(filename, "rb")
    xl_sheet = xl_workbook.sheet_by_index(0)
    print('Sheet Name: %s' % xl_sheet.name)
    row = xl_sheet.row(0)

    from xlrd.sheet import ctype_text
    print('(Column #) type:value')
    for idx, cell_obj in enumerate(row):
        cell_type_str = ctype_text.get(cell_obj.ctype, 'unknown type')
        # print('(%s) %s %s' % (idx, cell_type_str, cell_obj.value))

    num_cols = xl_sheet.ncols
    for row_idx in range(0, xl_sheet.nrows):  # iterate through rows
        num_cols = xl_sheet.ncols
        id_obj = xl_sheet.cell(row_idx, 1)  # get cell object by row, col
        itemid = id_obj.value
        # if itemid not in itemidlist:
        itemidlist.append(itemid)

    # execute our query
    '''
    cursor.execute("""
    if not exists(SELECT 1 FROM model_enable AS c WHERE c.name = %s);
    BEGIN;
        INSERT INTO model_enable (name) VALUES (%s)
    END;
    """ % (itemid, itemid))
    '''
    cursor.execute("drop table temp_mbp1")
    try:
        cursor.execute("SELECT p.model_no, pc.id as PCid, g.id AS GROUPid into public.temp_mbp1 FROM products p, \
            model_enable me, products_clients pc, groups g WHERE p.model_no = me.name \
            and p.id = pc.product_id and pc.client_id = %s and pc.client_id = g.client_id and g.id = %s" \
            % (dealer_id, group_id))
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)

    cursor.execute("select count(*) from public.temp_mbp1")
    # retrieve the records from the database
    records = cursor.fetchall()
    # print out the records using pretty print
    # note that the NAMES of the columns are not shown, instead just indexes.
    # for most people this isn't very useful, so we'll show you how to return
    # columns as a dictionary (hash) in the next example.
    pprint.pprint(records)

if __name__ == "__main__":
    main()
The try/except block in the middle of the program is not throwing any error, but the table is not getting created in the Postgres database as far as I can see in the data admin tool.
The output shown is:
Please enter dealer_id: 90
Please enter group_id: 13
Sheet Name: Winco Full 8_15_17
(Column #) type:value
[(3263,)]
Thanks,
Santosh
You didn't commit the changes, so they aren't saved in the database. Add this at the bottom, just below the pprint statement:
conn.commit()
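As an alternative sketch (assuming psycopg2, as in the question): the connection can also be used as a context manager, which commits the transaction when the block exits cleanly and rolls it back on an exception; closing the connection is still up to you.

with conn:
    with conn.cursor() as cursor:
        # statements executed here are committed when the outer block exits without an error
        cursor.execute("SELECT count(*) FROM public.temp_mbp1")
        print(cursor.fetchone())
conn.close()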
My second data frame is not loading values when I create it. Any idea why it is not working? When I turn my cursor into a list, it has a bunch of values in it, but for whatever reason, when I try to do a normal data frame load with pandas a second time, it does not work.
My code:
conn = pyodbc.connect(constr, autocommit=True)
cursor = conn.cursor()
secondCheckList = []
checkCount = 0
maxValue = 0

strsql = "SELECT * FROM CRMCSVFILE"
cursor = cursor.execute(strsql)
cols = []
SQLupdateNewIdField = "UPDATE CRMCSVFILE SET NEW_ID = ? WHERE Email_Address_Txt = ? OR TELEPHONE_NUM = ? OR DRIVER_LICENSE_NUM = ?"
for row in cursor.description:
    cols.append(row[0])
df = pd.DataFrame.from_records(cursor)
df.columns = cols
newIdInt = 1

for row in range(len(df['Email_Address_Txt'])):
    # run initial search to figure out the max number of records. Look for email, phone, and
    # driver's license; names have a chance of not being unique
    SQLrecordCheck = "SELECT * FROM CRMCSVFILE WHERE Email_Address_Txt = '" + str(df['Email_Address_Txt'][row]) + "' OR TELEPHONE_NUM = '" + str(df['Telephone_Num'][row]) + "' OR DRIVER_LICENSE_NUM = '" + str(df['Driver_License_Num'][row]) + "'"
    ## print(SQLrecordCheck)
    cursor = cursor.execute(SQLrecordCheck)
    ## maxValue is indeed a list filled with records
    maxValue = list(cursor)
    ## THIS IS WHERE THE PROBLEM OCCURS
    tempdf = pd.DataFrame.from_records(cursor)
Why not just use pd.read_sql_query("your_query", conn)? It returns the result of the query as a dataframe and requires less code. Also, you set cursor to cursor.execute(strsql) at the top and then try to call execute on cursor again in your for loop, but you can no longer call execute on it there; you would have to set cursor = conn.cursor() again first.
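A minimal sketch of that suggestion, assuming the same CRMCSVFILE table as in the question; the connection string is a placeholder:

import pandas as pd
import pyodbc

constr = "DSN=MyDsn"  # hypothetical; use the same connection string as in the question
conn = pyodbc.connect(constr, autocommit=True)

# read_sql_query builds the dataframe, including the column names, in one call
df = pd.read_sql_query("SELECT * FROM CRMCSVFILE", conn)

# use a separate cursor for any follow-up statements such as the UPDATE
cursor = conn.cursor()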