JSON serializable error - Automatically update data to Google sheet - python

I would like to use Python to push data from an Excel file to a Google Sheet.
But I get the error "TypeError: Object of type datetime is not JSON serializable".
I believe the error is raised on the line "wks.update_cells(cell_list)". How can I solve this kind of error? Thank you
import os
import pandas as pd
import glob
import datetime
import numpy as np
import time
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import json

def numberToLetters(q):
    q = q - 1
    result = ''
    while q >= 0:
        remain = q % 26
        result = chr(remain + 65) + result
        q = q // 26 - 1
    return result

current_path = os.getcwd()
raw_df = pd.read_excel(glob.glob(os.path.join(current_path, 'Studio*'))[0],
                       sheet_name="Dashboard", encoding="ISO-8859-1")
df = raw_df.replace(np.nan, '', regex=True)

scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']
credentials = ServiceAccountCredentials.from_json_keyfile_name('startup_funding.json', scope)
gc = gspread.authorize(credentials)
wks = gc.open("Testing_Dashboard_Upload").sheet1
wks.clear()

columns = df.columns.values.tolist()
# selection of the range that will be updated
cell_list = wks.range('A1:' + numberToLetters(len(columns)) + '1')
# modifying the values in the range
for cell in cell_list:
    val = columns[cell.col - 1]
    # if type(val) is str:
    #     val = val.decode('utf-8')
    cell.value = val
# update in batch
wks.update_cells(cell_list)

# 4. handle content
# number of lines and columns
num_lines, num_columns = df.shape
# selection of the range that will be updated
cell_list = wks.range('A2:' + numberToLetters(num_columns) + str(num_lines + 1))
# modifying the values in the range
for cell in cell_list:
    val = df.iloc[cell.row - 2, cell.col - 1]
    if isinstance(val, (np.float64, np.int64, np.bool_)):
        # val = np.asscalar(val)
        # DeprecationWarning: np.asscalar(a) is deprecated since NumPy v1.16, use a.item() instead
        val = val.item()
    cell.value = val
# 5. update in batch
wks.update_cells(cell_list)

The Python json library won't serialize datetime objects; you have to convert them yourself. Find out which value in cell_list is a datetime and convert it to a string using the strftime method. From your code I think you are setting cell.value to a datetime object. If that is so, you may change the line
cell.value = val
to
if isinstance(val, datetime.datetime):
    val = val.strftime("%m/%d/%Y, %H:%M:%S")
cell.value = val
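If the sheet contains several date or time columns, it may be simpler to sanitize the whole DataFrame once, before building cell_list. A minimal self-contained sketch of that idea (assuming the same df as in the question; pandas Timestamps subclass datetime.datetime, so one isinstance check covers both):

import datetime
import pandas as pd

def to_serializable(val):
    # Convert datetime-like values (including pandas Timestamps) to strings.
    if isinstance(val, datetime.datetime):
        return val.strftime("%m/%d/%Y, %H:%M:%S")
    return val

# applied to the question's df before the "# 4. handle content" step:
# df = df.applymap(to_serializable)

# small self-contained check
demo = pd.DataFrame({"when": [pd.Timestamp("2021-01-02 03:04:05")], "name": ["a"]})
print(demo.applymap(to_serializable))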

Related

keep calling an API until it is updated with latest item (Python)

I'm looking to call an API and compare the data to my saved data in a CSV. If there is a new data point, I want to update my CSV and return the DataFrame. The mystery is why these two variables appear to be the same, yet the if statement falls through to the else instead of recognizing they are equal; if they are the same, it should keep looping until an updated data point appears (see second_cell == lastItem1).
import pandas_datareader as pdr  # https://medium.com/swlh/pandas-datareader-federal-reserve-economic-data-fred-a360c5795013
import datetime

def datagetter():
    i = 1
    while i < 120:
        # Step 1: get data, and print last item
        start = datetime.datetime(2005, 1, 1)
        end = datetime.datetime(2040, 1, 1)
        df = pdr.DataReader('PAYEMS', 'fred', start, end)  # This is the API
        lastItem1 = df["PAYEMS"].iloc[-1]  # find the last item in the data we have just downloaded
        print("Latest item from Fred API: ", lastItem1)  # print the last item
        with open('PAYEMS.csv', 'r') as logs:  # so first we open the most recent CSV file
            data = logs.readlines()
        last_row = data[-1].split(',')  # split is default on , as CSVs should be
        second_cell = last_row[1]  # "second_cell" is our variable name for the saved datapoint from last month/week/day
        print("Last Item, in thousands", second_cell)
        if second_cell == lastItem1:
            print("CSV ", second_cell, "API ", lastItem1, " downloaded and stored items are the same, will re-loop until a new datapoint")
            print("attempt no.", i)
            i += 1
        else:
            df.to_csv("PAYEMS.csv")
            print("returning dataframe")
            # print(df.tail())
            return df

df = datagetter()
print(df.tail(3))
Solved my own problem: my CSV was returning a string, and the API an int... not quite sure why. So:
if second_cell == "":
    second_cell = 0
second_cell1 = int(float(second_cell))
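In other words, the CSV value comes back as text while the API value is numeric, so the comparison only behaves as expected once both sides are normalized to the same type. A short self-contained sketch of that idea (variable names are mine, not from the original code):

api_value = 151337     # what the FRED API returns (an int)
csv_value = "151337"   # what the last CSV cell looks like (a string)

csv_value = int(float(csv_value)) if csv_value.strip() else 0
if csv_value == int(api_value):
    print("stored and downloaded values match, keep polling")
else:
    print("new data point found")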

Alphabetically ordering Worksheets using openpyxl in WriteOnlyWorksheet mode

Background: I have created a script that takes a .xlsx file and splits the groupings of data into separate worksheets, based on the names of each grouping. E.g., Client A grouped data will be written to a worksheet named Client A, and so forth. Additionally, the script strips out Excel-invalid characters and the miscellaneous strings contained within INVALID_TITLE_CHARS and INVALID_TITLE_NAMES, and uses the values in management_style and model_type to decide which rows are irrelevant and therefore skipped during the copy process.
My issue: I would like to expand the script so that the worksheets are ordered alphabetically, e.g. Client A, Client B, Client C, and so forth. I have tried to achieve this with client_sheet._sheets.sort(key=lambda output_workbook: output_workbook.title); however, for memory-consistency reasons I have used WriteOnlyWorksheet, which seems incompatible with _sheets. Does anyone know if there is another solution? I was hoping to avoid creating a function that reopens the .xlsx.
The script: This is the full script:
import time
import logging
import sys
import datetime as dt
import datetime
import requests as requests
import json
import enlighten
import warnings
from openpyxl import load_workbook
from openpyxl import LXML
from openpyxl.cell import WriteOnlyCell
from openpyxl import Workbook
from copy import copy
from configparser import ConfigParser
from requests.auth import HTTPBasicAuth

logger = logging.getLogger()
timestr = datetime.datetime.now().strftime("%Y-%m-%d")
warnings.filterwarnings("ignore")

def configure_logging():
    logger.setLevel(logging.INFO)
    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter("%(asctime)s - %(message)s")
    handler.setFormatter(formatter)
    logger.addHandler(handler)

INVALID_TITLE_CHARS = ["]", "[", "*", ":", "?", "/", "\\", "'"]
INVALID_TITLE_CHAR_MAP = {ord(x): "" for x in INVALID_TITLE_CHARS}
INVALID_TITLE_NAMES = ["zz_ FeeRelationship", "Family"]

def clean_sheet_title(title):
    title = title or ""
    title = title.strip()
    title = title.translate(INVALID_TITLE_CHAR_MAP)
    for name in INVALID_TITLE_NAMES:
        title = title.replace(name, "")
    return title[:31]

def is_client_row(row, row_dimension):
    return row_dimension.outlineLevel == 0

def create_write_only_cell(source_cell, target_sheet):
    target_cell = WriteOnlyCell(target_sheet, value=source_cell.value)
    target_cell.data_type = source_cell.data_type
    if source_cell.has_style:
        target_cell.font = copy(source_cell.font)
        # target_cell.border = copy(source_cell.border)
        # target_cell.fill = copy(source_cell.fill)
        target_cell.number_format = copy(source_cell.number_format)
        # target_cell.protection = copy(source_cell.protection)
        target_cell.alignment = copy(source_cell.alignment)
    return target_cell

def create_write_only_row(source_row, target_sheet):
    return [create_write_only_cell(cell, target_sheet) for cell in source_row]

def skip_row(row, row_dimension):
    """
    Determine whether a row needs to be skipped and not copied to new workbook
    """
    def get_column_value(column):
        value = row[column].value or ""
        return value.strip()

    # skip total line
    if row[0].value == "Total":
        return True

    management_style = [
        "Advisory",
        "Advisory - No Fee",
        "Holding",
        "JPAS",
        "Liquidity Management",
        "Trading",
        "",
    ]
    model_type = ["Client", "Holding Account", "Holding Company", "Trust", ""]
    management_value = get_column_value(3)
    model_value = get_column_value(11)
    # Pass on either column
    return management_value not in management_style and model_value not in model_type
    # # Pass on both columns
    # return management_value not in management_style or model_value not in model_type

def split_workbook(input_file, output_file):
    """
    Split workbook each client into its own sheet.
    """
    try:
        logger.info(f"Loading workbook {input_file}")
        workbook = load_workbook(input_file)
        data_sheet = workbook.active
        output_workbook = Workbook(write_only=True)
        client_sheet = None
        client_row_index = 2
        processing_client = 0
        skip_child = False
        skipped_parent_outline_level = 0
        skipped_rows_per_client = 0
        rows = data_sheet.rows
        header = next(rows)
        for index, row in enumerate(rows, start=2):
            row_dimension = data_sheet.row_dimensions[index]
            # verify whether current row is a child of skipped parent
            if skip_child and skipped_parent_outline_level < row_dimension.outlineLevel:
                skipped_rows_per_client += 1
                continue
            # reset skip_child when current row is not a child of skipped parent anymore
            if (
                skip_child
                and skipped_parent_outline_level >= row_dimension.outlineLevel
            ):
                skip_child = False
            # check whether row needs to be skipped
            if skip_row(row, row_dimension):
                skipped_rows_per_client += 1
                skip_child = True
                skipped_parent_outline_level = row_dimension.outlineLevel
                continue
            # create new sheet when a new client is found
            if is_client_row(row, row_dimension):
                skipped_rows_per_client = 0
                processing_client += 1
                client_sheet_title = clean_sheet_title(row[0].value)
                logger.info(f"Processing client {processing_client}")
                client_sheet = output_workbook.create_sheet(client_sheet_title)
                client_row_index = index
                # copy column dimensions
                for key, column_dimension in data_sheet.column_dimensions.items():
                    client_sheet.column_dimensions[key] = copy(column_dimension)
                    client_sheet.column_dimensions[key].worksheet = client_sheet
                client_sheet.append(create_write_only_row(header, client_sheet))
            # copy row dimensions
            new_row_index = index - skipped_rows_per_client - client_row_index + 2
            client_sheet.row_dimensions[new_row_index] = copy(row_dimension)
            client_sheet.row_dimensions[new_row_index].worksheet = client_sheet
            # finally copy row
            client_sheet.append(create_write_only_row(row, client_sheet))
            if index % 10000 == 0:
                logger.info(f"{index} rows processed")
        logger.info(f"Writing workbook {output_file}")
        output_workbook.save(output_file)
    finally:
        if workbook:
            workbook.close()
        if output_workbook:
            output_workbook.close()

if __name__ == "__main__":
    start = time.time()
    configure_logging()
    input_file = 'Input_File' + timestr + '.xlsx'
    output_file = 'Output_File' + timestr + '.xlsx'
    logger.info(f"Using lxml mode: {LXML}")
    split_workbook(input_file, output_file)
    logger.info("Time consumed: %s seconds" % (time.time() - start))
I was able to achieve this using _sheets.sort() on the titles. Here is an example to demonstrate the same...
from openpyxl import Workbook
from openpyxl.cell import WriteOnlyCell

# Create workbook in write_only mode as you did
output_workbook = Workbook(write_only=True)

# Add 4 sheets
ws1 = output_workbook.create_sheet("clientB_sheet_title")
ws2 = output_workbook.create_sheet("clientA_sheet_title")
ws3 = output_workbook.create_sheet("clientD_sheet_title")
ws4 = output_workbook.create_sheet("clientC_sheet_title")

# Add some data at start
cell = WriteOnlyCell(ws1, value="hello clientB")
ws1.append([cell, 3.14, None])
cell = WriteOnlyCell(ws2, value="hello clientA")
ws2.append([cell, 3.14, None])
cell = WriteOnlyCell(ws3, value="hello clientD")
ws3.append([cell, 3.14, None])
cell = WriteOnlyCell(ws4, value="hello clientC")
ws4.append([cell, 3.14, None])

### The key to your question - the sorting of titles ###
output_workbook._sheets.sort(key=lambda ws: ws.title)

# Finally save
output_workbook.save("output_file.xlsx")
OPTION 2 (better and safer option) - using move_sheet()
As per Charlie Clark's recommendation, the better and safer option is to use move_sheet(). Note that I am only including the code that replaces the last two statements above (the _sheets sort and the save); the results are the same...
asc_sheetlist = output_workbook.sheetnames
asc_sheetlist.sort()
for pos, name in enumerate(asc_sheetlist):
    output_workbook.move_sheet(name, pos - output_workbook.sheetnames.index(name))
output_workbook.save("output_file.xlsx")
Based on Redox's answer, I was able to throw together a super-simple function to achieve this -
from openpyxl import load_workbook

def alphabetical_client_sheet_sort(output_file):
    workbook = load_workbook(output_file)
    workbook._sheets.sort(key=lambda sheet: sheet.title)
    workbook.save(output_file)
    workbook.close()
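If touching the private _sheets list is a concern, the same post-processing step could be written with the public move_sheet() API after reopening the saved file; a sketch under that assumption:

from openpyxl import load_workbook

def alphabetical_sheet_sort(output_file):
    # Reopen the saved workbook and reorder its sheets alphabetically
    # using the public move_sheet() API instead of the private _sheets list.
    workbook = load_workbook(output_file)
    for pos, title in enumerate(sorted(workbook.sheetnames)):
        workbook.move_sheet(title, pos - workbook.sheetnames.index(title))
    workbook.save(output_file)
    workbook.close()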

TypeError: can only concatenate str (not "bytes") to str - decrypting from pandas

I'm trying to decrypt encrypted data stored in a text file in my Dropbox bucket. I create a pandas DataFrame from the txt data, match it against my user input, and pick only the row that matches. I then run that data through my AES-CBC decryption.
When I run it I get the error: "TypeError: can only concatenate str (not "bytes") to str".
The error comes from my padding function (p4 = padded_text(df1)).
I'm not sure if this is an encoding issue, or whether the problem is that I'm using .get() for the user input, which returns a string. Any help would be greatly appreciated.
def login():
    import sqlite3
    import os.path

    def decoder():
        from Crypto.Cipher import AES
        import hashlib
        from secrets import token_bytes

        cursor.execute(
            '''
            Select enc_key FROM Login where ID = (?);
            ''',
            (L_ID_entry.get(), ))
        row = cursor.fetchone()
        if row is not None:
            keys = row[0]
        import pyDes

        # design padding function for encryption
        def padded_text(data_in):
            while len(data_in) % 16 != 0:
                data_in = data_in + b"0"
            return data_in

        # calling stored key from main file and reverting back to bytes
        key_original = bytes.fromhex(keys)
        mode = AES.MODE_CBC
        # model
        cipher = AES.new(key_original, mode, IV3)
        # padding data
        p4 = padded_text(df1)
        p5 = padded_text(df2)
        p6 = padded_text(df3)
        # decrypting data
        d_fname = cipher.decrypt(p4)
        d_sname = cipher.decrypt(p5)
        d_email = cipher.decrypt(p6)

    # connecting to db
    try:
        conn = sqlite3.connect('login_details.db')
        cursor = conn.cursor()
        print("Connected to SQLite")
    except sqlite3.Error as error:
        print("Failure, error: ", error)
    finally:
        # downloading txt from dropbox and converting to dataframe to operate on
        import New_user
        _, res = client.files_download("/user_details/enc_logins.txt")
        with io.BytesIO(res.content) as csvfile:
            df = pd.read_csv(csvfile, sep=';', names=['ID', 'Fname', 'Sname', 'Email'])
            newdf = df.loc[df['ID'] == L_ID_entry.get()]
            print(newdf)
            df1 = newdf['Fname']
            df2 = newdf['Sname']
            df3 = newdf['Email']
            csvfile.close()
            decoder()
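For what it's worth, the TypeError itself points at padded_text(): it appends the bytes literal b"0", which fails as soon as data_in is a str. A minimal, self-contained illustration of one way such a helper could accept both str and bytes (an assumption on my part, not the poster's code; note that df1 above is a pandas Series, so a single value such as newdf['Fname'].iloc[0] would still have to be passed in):

def padded_text(data_in):
    # Work on bytes: encode str input first, then pad with b"0" up to a
    # multiple of the 16-byte AES block size.
    if isinstance(data_in, str):
        data_in = data_in.encode("utf-8")
    while len(data_in) % 16 != 0:
        data_in = data_in + b"0"
    return data_in

print(padded_text("Alice"))  # 16 bytes: b'Alice' followed by eleven b'0' characters
print(padded_text(b"Bob"))   # 16 bytes: b'Bob' followed by thirteen b'0' characters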

Decryption not working - how to get raw data from csv/pandas - python

Below is my code for decrypting from a CSV file stored on Dropbox. I get the user to type in their ID, I match this with a database containing hashed values, and then I use the ID typed in to search my stored CSV file for the matching row. I then pass all of the row values into my decryption function.
I'm aware my variable names/formatting are awful; I'm just using this code as a prototype for now.
My results are being printed as such:
b'b\xebS\x1b\xc8v\xe2\xf8\xa2\\x84\x0e7M~\x1b'
b'\x01B#6i\x1b\xfc]\xc3\xca{\xd5{B\xbe!'
b'in*V7\xf3P\xa0\xb2\xc5\xd2\xb7\x1dz~\x95'
I store my key and IV so they are always the same, yet the decryption doesn't seem to work. My only thought is that perhaps my data is changed somehow when stored in a CSV or pandas table. Does anyone know what the issue would be, or whether the bytes can be altered when stored in or imported to a DataFrame?
Also, maybe I am extracting the data from my CSV to pandas incorrectly?
def login():
    import sqlite3
    import os.path

    def decoder():
        from Crypto.Cipher import AES
        import hashlib
        from secrets import token_bytes

        cursor.execute(
            '''
            Select enc_key FROM Login where ID = (?);
            ''',
            (L_ID_entry.get(), ))
        row = cursor.fetchone()
        if row is not None:
            keys = row[0]

        # design padding function for encryption
        def padded_text(data_in):
            while len(data_in) % 16 != 0:
                data_in = data_in + b"0"
            return data_in

        # calling stored key from main file and reverting back to bytes
        key_original = bytes.fromhex(keys)
        mode = AES.MODE_CBC
        # model
        cipher = AES.new(key_original, mode, IV3)
        # padding data
        p4 = padded_text(df1.tobytes())
        p5 = padded_text(df2.tobytes())
        p6 = padded_text(df3.tobytes())
        # decrypting data
        d_fname = cipher.decrypt(p4)
        d_sname = cipher.decrypt(p5)
        d_email = cipher.decrypt(p6)
        print(d_fname)
        print(d_sname)
        print(d_email)

    # connecting to db
    try:
        conn = sqlite3.connect('login_details.db')
        cursor = conn.cursor()
        print("Connected to SQLite")
    except sqlite3.Error as error:
        print("Failure, error: ", error)
    finally:
        # downloading txt from dropbox and converting to dataframe to operate on
        import New_user
        import ast
        _, res = client.files_download("/user_details/enc_logins.csv")
        with io.BytesIO(res.content) as csvfile:
            with open("enc_logins.csv", 'rb'):
                df = pd.read_csv(csvfile, names=['ID', 'Fname', 'Sname', 'Email'], encoding='unicode_escape')
                newdf = df[(df == L_ID_entry.get()).any(axis=1)]
                print(newdf)
                df1 = newdf['Fname'].to_numpy()
                df2 = newdf['Sname'].to_numpy()
                df3 = newdf['Email'].to_numpy()
                print(df1)
                print(df2)
                print(df3)
                csvfile.close()
                decoder()
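One likely candidate (an assumption, since the code that writes the CSV is not shown): if the encrypted bytes were written to the CSV by stringifying them, the file holds their text representation, and read_csv hands back that string rather than the original bytes, so the ciphertext fed to AES is no longer the ciphertext that was produced. A small self-contained illustration of that round trip, using ast.literal_eval (which the code above already imports) to recover real bytes from the stored repr:

import ast

original = b'\x01B#6i\x1b\xfc]'
stored = str(original)                 # the text form a CSV cell would hold: "b'\\x01B#6i\\x1b\\xfc]'"
recovered = ast.literal_eval(stored)   # parse the repr back into actual bytes

print(stored)                 # a plain str, not bytes
print(recovered == original)  # True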

IndexError: single positional indexer is out-of-bounds error while downloading data

I am running code to download data and save it to my local drive. However, I am getting the above-mentioned error message. Please note that I initially convert the date to a different format, and the error is raised while saving the files.
Can you please help me with this error?
import quandl
import os
import pandas as pd
import datetime as dt
import glob

if __name__ == "__main__":
    # Creating bucket to store missing data file.
    data_missing = []
    New_date = []
    # Defining a path to save CSV files after downloading, and deleting all existing csv files in one go.
    extension = 'csv'
    path = "F:/Tradepoint/MyMkt/"
    if not os.path.exists(path):
        os.mkdir(path)
    os.chdir(path)
    csv_count = [forma for forma in glob.glob('*.{}'.format(extension))]
    for csv_coun in range(len(csv_count)):
        os.remove(r"F:/Tradepoint/MyMkt/" + csv_count[csv_coun][0:])
    # Setting up quandl configuration, reading the ticker list, and setting up the dates for which data will be downloaded.
    quandl.ApiConfig.api_key = 'Hba3CzgNnEa2LMxR14FA'
    end_date = dt.date.today()
    diff_year = dt.timedelta(days=3650)
    start_date = end_date - diff_year
    stock_list = pd.read_csv(r"F:\Abhay_New\Abhay\Python\Project\SHARADAR_SF1.csv")
    # Looping through the quandl website to download data and renaming the files as per requirement.
    for stock_lis in range(len(stock_list)):
        data = quandl.get_table('SHARADAR/SEP', date={'gte': start_date, 'lte': end_date}, ticker=stock_list.iloc[stock_lis])
        sort_by_date = data.sort_values('date')
        for sort_by_dat in range(len(sort_by_date['date'])):
            Date = dt.date.strftime(sort_by_date['date'][sort_by_dat], '%d-%m-%Y')
            New_date.append(Date)
        if len(data) > 1:
            Date = pd.Series(New_date).rename('Date').astype(str)
            OPEN = sort_by_date['open']
            HIGH = sort_by_date['high']
            LOW = sort_by_date['low']
            CLOSE = sort_by_date['close']
            VOLUME = sort_by_date['volume']
            final_data = pd.concat([Date, OPEN, HIGH, LOW, CLOSE, VOLUME], axis=1)
            stk = stock_list.iloc[sort_by_dat][0]
            final_data.to_csv(str(path + stk + '.csv'), sep=',', index=False, header=False)
        else:
            data_missing.append(stock_list.iloc[sort_by_dat])
    print(data_missing)
Thanks,
Abhay Dodiya
Inside the if/else you index stock_list with sort_by_dat, the inner loop's counting variable, after that loop has already finished. A loop variable keeps its last value once the loop exits, which causes potentially unintended behavior:
for i in range(2):
    for i in range(3, 11):
        pass
    print(i)
gives
10
10
So even after exiting the inner loop, the last value of i is still there. Use the outer loop's variable (stock_lis) when indexing stock_list after the inner loop and the issue should be gone.
In your case you probably have more dates than stocks, which is why you observe the IndexError.
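A self-contained sketch of that pattern, with stand-in data rather than the Quandl code:

stocks = ["AAPL", "MSFT"]   # short outer sequence
dates = list(range(8))      # longer inner sequence, like the per-stock date rows

for stock_idx in range(len(stocks)):
    for date_idx in range(len(dates)):
        pass                # per-date work goes here
    # After the inner loop, date_idx is 7; using it to index stocks would raise
    # IndexError. Index with the outer loop's variable instead:
    print(stocks[stock_idx])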
