I have Python code that does data analytics on a CSV file. I want to run it periodically in a Docker container: every 15 seconds it should look at folder A, and if there is a CSV file in it, process it and put an HTML report with the same name in folder B.
Here is my Python code:
# This program pulls data from a csv file and writes it out as an html file.
# The csv file contains device names, card names and the temperatures of the cards.
# The html report contains: how many devices and how many cards are in the system,
# which device has the highest-temperature card, and a table listing, for each device,
# how many cards it has, how many of them are at 70 degrees or above, and the
# highest and average card temperatures.
# NOTE: The print calls in the program are only there for debugging.
import pandas as pd
from prettytable import PrettyTable, PLAIN_COLUMNS
table = PrettyTable() #create a table for device
table2 = PrettyTable() #create a table for summary
table.field_names = ["Device -", "Total # of Cards - ", "High Temp. Cards # - ", "Max Temperature - ", "Avg. Temperature "]
table2.field_names = [" "," "]
df = pd.read_csv("cards.csv", sep=';', usecols = ['Device','Card','Temperature'])""", index_col=["Device","Card"]"""
print(type(df))
print(df["Device"].nunique(),"\n\n") # number of unique server
total_devices = df["Device"].nunique() # NUMBER OF DEVICES IN DIFFERENT TYPES
print(total_devices)
print(df["Device"].loc[1],"\n\n")
print(df['Temperature'].max(),"\n\n")
maxTemp = df['Temperature'].max() #finding max temperature
print("total card ", )
i= 0
j=1
#Finding the card with the max temperature and the server where the card is located
while j > 0:
if df["Temperature"].loc[i] == df["Temperature"].max():
print(df["Device"].loc[i])
print(df["Card"].loc[i])
deviceName = df["Device"].loc[i]
cardName = df["Card"].loc[i]
j= 0
else :
i = i+1
dev_types = df["Device"].unique() # Server's names
print("\n\n")
newstr = cardName + "/" + deviceName
# Build the summary table
table2.add_row(["Total Devices ", total_devices] )
table2.add_row(["Total Cads ", len(df["Card"])])
table2.add_row(["Max Card Temperature ", df["Temperature"].max()])
table2.add_row(["Hottest Card / Device " ,newstr])
print(table2)
row_num = len(df)
print(row_num)
# Re-read the file indexed by device so that each device's cards and temperatures are grouped; the per-device max temp is taken from here
dn = pd.read_csv("cards.csv", sep=';', index_col=["Device"], usecols = ['Device','Card','Temperature'])
sum = []
high = []
#print("max temp: ", dn["Temperature"].loc[dev_types[1]].max())
for x in range(total_devices): # total devices (according the file = 3 )
print("\n")
cardCount = 0 # counts the number of cards belonging to the device
count2 = 0 # Counts the number of cards with a temperature greater than 70
tempcount = 0
print(dev_types[x])
for y in range(row_num):
if dev_types[x] == df["Device"].loc[y]:
print(df["Temperature"].loc[y])
tempcount = tempcount + df["Temperature"].loc[y] # the sum of the temperatures of the cards(used when calculating the average)
cardCount = cardCount +1
if df["Temperature"].loc[y] >= 70:
count2 = count2 +1
maxT = dn["Temperature"].loc[dev_types[x]].max() #Finding the ones with the max temperature from the cards belonging to the server
avg = str(tempcount/cardCount)
print("avg",avg)
table.add_row([dev_types[x], cardCount, count2, maxT,avg ]) # I added the information to the "devices" table
print("num of cards" , cardCount)
print("high temp cards" , count2)
print("\n\n")
print("\n\n")
print(table)
htmlCode = table.get_html_string()
htmlCode2 = table2.get_html_string()
with open('devices.html', 'w') as f:
    f.write("SUMMARY")
    f.write(htmlCode2)
    f.write("DEVICES")
    f.write(htmlCode)
Whether or not the code is run in Docker doesn't matter.
Wrap all of that current logic (well, not the imports and so on) in a function, say, def process_cards().
Call that function forever, in a loop:
import logging
import time

def process_cards():
    table = PrettyTable()
    ...

def main():
    logging.basicConfig()
    while True:
        try:
            process_cards()
        except Exception:
            logging.exception("Failed processing")
        time.sleep(15)

if __name__ == "__main__":
    main()
As an aside, your data processing code can be vastly simplified:
import pandas as pd
from prettytable import PrettyTable
def get_summary_table(df):
summary_table = PrettyTable() # create a table for summary
total_devices = df["Device"].nunique()
hottest_card = df.loc[df["Temperature"].idxmax()]
hottest_device_desc = f"{hottest_card.Card}/{hottest_card.Device}"
summary_table.add_row(["Total Devices", total_devices])
summary_table.add_row(["Total Cards", len(df["Card"])])
summary_table.add_row(["Max Card Temperature", df["Temperature"].max()])
summary_table.add_row(["Hottest Card / Device ", hottest_device_desc])
return summary_table
def get_devices_table(df):
devices_table = PrettyTable(
[
"Device",
"Total # of Cards",
"High Temp. Cards #",
"Max Temperature",
"Avg. Temperature",
]
)
for device_name, group in df.groupby("Device"):
count = len(group)
avg_temp = group["Temperature"].mean()
max_temp = group["Temperature"].max()
high_count = group[group.Temperature >= 70]["Temperature"].count()
print(f"{device_name=} {avg_temp=} {max_temp=} {high_count=}")
devices_table.add_row([device_name, count, high_count, max_temp, avg_temp])
return devices_table
def do_processing(csv_file="cards.csv", html_file="devices.html"):
# df = pd.read_csv(csv_file, sep=';', usecols=['Device', 'Card', 'Temperature'])
# (Just some random example data)
df = pd.DataFrame({
"Device": [f"Device {1 + x // 3}" for x in range(10)],
"Card": [f"Card {x + 1}" for x in range(10)],
"Temperature": [59.3, 77.2, 48.5, 60.1, 77.2, 61.1, 77.4, 65.8, 71.2, 60.3],
})
summary_table = get_summary_table(df)
devices_table = get_devices_table(df)
with open(html_file, "w") as f:
f.write(
"<style>table, th, td {border: 1px solid black; border-collapse: collapse;}</style>"
)
f.write("SUMMARY")
f.write(summary_table.get_html_string(header=False))
f.write("DEVICES")
f.write(devices_table.get_html_string())
do_processing()
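To cover the folder part of the original question (look at folder A every 15 seconds and drop a same-named HTML report into folder B), here is a minimal sketch built on the do_processing() function above; the folder names, the 15-second interval, and the keep-a-set-of-processed-files approach are assumptions you would adapt (for example you may prefer to move or delete each CSV after processing):
import logging
import time
from pathlib import Path

def watch_folder(input_dir="A", output_dir="B", interval=15):
    """Poll input_dir every `interval` seconds; for each CSV found, write an
    HTML report with the same base name into output_dir."""
    logging.basicConfig(level=logging.INFO)
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    processed = set()  # names of files we have already handled
    while True:
        for csv_path in Path(input_dir).glob("*.csv"):
            if csv_path.name in processed:
                continue
            try:
                do_processing(csv_file=csv_path,
                              html_file=out / (csv_path.stem + ".html"))
                processed.add(csv_path.name)
            except Exception:
                logging.exception("Failed processing %s", csv_path)
        time.sleep(interval)

# watch_folder()  # runs forever; in Docker this would be the container's command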
I have an example of a repeat decorator that runs your function every N seconds or minutes.
I hope this sample helps you:
from typing import Optional, Callable, Awaitable
import asyncio
import logging
from functools import wraps

def repeat_every(*, seconds: float, wait_first: bool = False) -> Callable:
    def decorator(function: Callable[[], Optional[Awaitable[None]]]):
        is_coroutine = asyncio.iscoroutinefunction(function)

        @wraps(function)
        async def wrapped():
            async def loop():
                if wait_first:
                    await asyncio.sleep(seconds)
                while True:
                    try:
                        if is_coroutine:
                            await function()
                        else:
                            # run a blocking function in a worker thread (Python 3.9+)
                            await asyncio.to_thread(function)
                    except Exception:
                        # log and keep looping instead of killing the task
                        logging.exception("Repeated task failed")
                    await asyncio.sleep(seconds)

            asyncio.create_task(loop())

        return wrapped

    return decorator

@repeat_every(seconds=2)
async def main():
print(2*2)
try:
loop = asyncio.get_running_loop()
except RuntimeError:
loop = None
if loop and loop.is_running():
print('Async event loop already running.')
tsk = loop.create_task(main())
tsk.add_done_callback(
lambda t: print(f'Task done with result= {t.result()}'))
else:
print('Starting new event loop')
asyncio.run(main())
Another option is to make an entrypoint for the container that runs the script from a cron job.
Related
I am building a vaccination appointment program that automatically assigns a slot to the user.
This builds the table and saves it into a CSV file:
import pandas
start_date = '1/1/2022'
end_date = '31/12/2022'
list_of_date = pandas.date_range(start=start_date, end=end_date)
df = pandas.DataFrame(list_of_date)
df.columns = ['Date/Time']
df['8:00'] = 100
df['9:00'] = 100
df['10:00'] = 100
df['11:00'] = 100
df['12:00'] = 100
df['13:00'] = 100
df['14:00'] = 100
df['15:00'] = 100
df['16:00'] = 100
df['17:00'] = 100
df.to_csv(r'C:\Users\Ric\PycharmProjects\pythonProject\new.csv')
And this code randomly picks a date and an hour from that date in the CSV table we just created:
import pandas
import random
from random import randrange
#randrange randomly picks an index for date and time for the user
random_date = randrange(365)
random_hour = randrange(10)
list = ["8:00", "9:00", "10:00", "11:00", "12:00", "13:00", "14:00", "15:00", "16:00", "17:00"]
hour = random.choice(list)
df = pandas.read_csv('new.csv')
date=df.iloc[random_date][0]
# 1 is subtracted from that cell as 1 slot will be assigned to the user
df.loc[random_date, hour] -= 1
df.to_csv(r'C:\Users\Ric\PycharmProjects\pythonProject\new.csv',index=False)
print(date)
print(hour)
I need help with making the program check if the random hour it chose on that date has vacant slots. I can manage the while loops that are needed if the number of vacant slots is 0. And no, I have not tried much because I have no clue of how to do this.
P.S. If you're going to try running the code, please remember to change the save and read location.
Here is how I would do it. I've also cleaned it up a bit.
import random
import pandas as pd
start_date, end_date = '1/1/2022', '31/12/2022'
hours = [f'{hour}:00' for hour in range(8, 18)]
df = pd.DataFrame(
data=pd.date_range(start_date, end_date),
columns=['Date/Time']
)
for hour in hours:
df[hour] = 100
# 1000 simulations
for _ in range(1000):
random_date, random_hour = random.randrange(365), random.choice(hours)
# Check if the chosen slot still has vacancies
if df.at[random_date, random_hour] > 0:
df.at[random_date, random_hour] -= 1
else:
# Pass here, but you can add whatever logic you want
# for instance you could give it the next free slot in the same day
pass
print(df.describe())
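If you want to implement the idea in the comment above (give the user the next free slot in the same day), here is a minimal sketch; the function name and the choice to only look at later hours on the same day are my own assumptions:
def next_free_slot(df, date_index, preferred_hour, hours):
    """Return preferred_hour if it still has capacity, otherwise the first
    later hour on the same day with a vacancy, or None if the day is full."""
    for hour in hours[hours.index(preferred_hour):]:
        if df.at[date_index, hour] > 0:
            return hour
    return None

# Inside the simulation loop you could then write:
# hour = next_free_slot(df, random_date, random_hour, hours)
# if hour is not None:
#     df.at[random_date, hour] -= 1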
import pandas
import random
from random import randrange
# randrange randomly picks an index for date and time for the user
random_date = randrange(365)
# random_hour = randrange(10) #consider removing this line since it's not used
lista = [  # renamed to avoid shadowing the Python built-in name "list"
"8:00",
"9:00",
"10:00",
"11:00",
"12:00",
"13:00",
"14:00",
"15:00",
"16:00",
"17:00",
]
hour = random.choice(lista)
df = pandas.read_csv("new.csv")
date = df.iloc[random_date][0]
# 1 is subtracted from that cell as 1 slot will be assigned to the user
if df.loc[random_date, hour] > 0:  # here is what you asked for
df.loc[random_date, hour] -= 1
else:
print(f"No Vacant Slots in {random_date}, {hour}")
df.to_csv(r"new.csv", index=False)
print(date)
print(hour)
Here's another alternative. I'm not sure you really need the very large and slow-to-load pandas module for this. This does it with plain Python structures. I tried to run the simulation until it got a failure, but with 365,000 open slots, and flushing the database to disk each time, it takes too long. I changed the 100 to 8, just to see it hit a full slot in a reasonable time.
import csv
import datetime
import random
def create():
start = datetime.date( 2022, 1, 1 )
oneday = datetime.timedelta(days=1)
headers = ["date"] + [f"{i}:00" for i in range(8,18)]
data = []
for _ in range(365):
data.append( [start.strftime("%Y-%m-%d")] + [8]*10 ) # not 100
start += oneday
write( headers, data )
def write(headers, rows):
fcsv = csv.writer(open('data.csv','w',newline=''))
fcsv.writerow( headers )
fcsv.writerows( rows )
def read():
days = []
headers = []
for row in csv.reader(open('data.csv')):
if not headers:
headers = row
else:
days.append( [row[0]] + list(map(int,row[1:])))
return headers, days
def choose( headers, days ):
random_date = random.randrange(365)
random_hour = random.randrange(len(headers)-1)+1
choice = days[random_date][0] + " " + headers[random_hour]
print( "Chose", choice )
if days[random_date][random_hour]:
days[random_date][random_hour] -= 1
write(headers,days)
return choice
else:
print("Randomly chosen slot is full.")
return None
create()
data = read()
while choose( *data ):
pass
Outcome 1 required:
The first batch of code below is in its working form.
Please assist in creating a function "def Calculations():" that contains all the list calculations and returns the same results with the static list. With the calculations in proper functions I will be able to refine the problem and might be able to move forward.
Outcome 2 required (for those who want to go in depth):
When I run the code on a live list that appends every x interval, it stalls the information feed. I believe it could be the creation of the appending lists in batches of increasing size, but I don't have a solution for it. Below is the working code.
I am getting my live data from Binance as an appending list of closes only, for those who would like to test it live.
The data could come from any source; it does not need to be Binance, as long as it is an appending list of closes in float format.
See the code below.
import itertools
l = [16.329,16.331, 16.3705, 16.3965, 16.44, 16.4227, 16.4028, 16.37, 16.3829, 16.3482, 16.3614, 16.4191, 16.4008, 16.4048, 16.4076, 16.3724, 16.3599, 16.3872, 16.3794, 16.3528, 16.3886, 16.3904, 16.3815, 16.3864, 16.4254, 16.4411, 16.4151, 16.4338, 16.4212, 16.3819, 16.2857, 16.2703, 16.2408, 16.1938, 16.2038, 16.2035, 16.217, 16.2374, 16.2414, 16.2238, 16.1787, 16.2725, 16.2964, 16.3155, 16.238, 16.2149, 16.2992, 16.3568, 16.2793, 16.2467, 16.312, 16.3117, 16.3017, 16.3465, 16.3882, 16.3698, 16.307, 16.3328, 16.3311, 16.3466, 16.3382, 16.3703, 16.3502, 16.3661, 16.38, 16.3972, 16.4141, 16.393, 16.3769, 16.3683, 16.4136, 16.3774, 16.3709, 16.3179, 16.3019, 16.3149, 16.2838, 16.2689, 16.2602, 16.2679, 16.2921, 16.312, 16.3158, 16.3198, 16.2955, 16.303, 16.327, 16.356, 16.313, 16.3, 16.2806, 16.2634, 16.2856, 16.2702, 16.2136, 16.2782, 16.276, 16.2231, 16.2255, 16.1314, 16.0796, 16.1192, 16.0977, 16.1358, 16.1408, 16.1703]
#### VARIABLES & SETTINGS ####
dataingestperiod = 17
original_list_count = len(l)
timeframed_list = l[-dataingestperiod:]
timeframed_list_count = len(timeframed_list)
def groupSequence(x):
it = iter(x)
prev, res = next(it), []
while prev is not None:
start = next(it, None)
if start and start > prev:
res.append(prev)
elif res:
yield list(res + [prev])
res = []
prev = start
def divbyZero(increasingcount,decreasingcount):
try: return increasingcount/decreasingcount
except ZeroDivisionError: return 0
def divbyZeroMain(increasingcountMain,decreasingcountMain):
try: return increasingcountMain/decreasingcountMain
except ZeroDivisionError: return 0
#### TIMEFRAMED LIST CALCULATIONS#####
listA_1 = (list(groupSequence(timeframed_list))) # obtain numbers in increasing sequence
# print(len(listA_1)) # number of increases in mixed format
listA = list(itertools.chain.from_iterable(listA_1)) # remove double brackets to enable list count
increasingcount = len(listA)
decreasingcount = timeframed_list_count - increasingcount
trend = divbyZero(increasingcount,decreasingcount)
#### MAIN APPENDING LIST CALCULATIONS #####
listMain_1 = (list(groupSequence(l)))
listMain = list(itertools.chain.from_iterable(listMain_1))
increasingcountMain = len(listMain)
decreasingcountMain = original_list_count - increasingcountMain
trendMain = divbyZeroMain(increasingcountMain,decreasingcountMain)
### Timeframed-list increases only, appended up to the last "dataingestperiod" values; perhaps the problem on live feed data ###
increase_list_timeframed = []
for x in listA:
increase_list_timeframed.append(x)
### Main list increases only appending...####
increase_list_Main = []
for x in listMain:
increase_list_Main.append(x)
###Prints ON TIMEFRAMED LIST ####
print ("---------------")
print ("---------------")
print ("Timeframed list count set at max {}".format(timeframed_list_count))
print ("Count of decreases in timeframed list is {}".format(decreasingcount))
print ("Count of increases in timeframed list is {}".format(increasingcount))
print ("Current timeframed trend is {}".format(trend))
print ("---------------")
print ("---------------")
###Prints ON MAIN LIST ####
print ("Main appending list count so far is {}".format(original_list_count))
print ("Count of decreases in Main appending list is {}".format(decreasingcountMain))
print ("Count of increases in Main appending list is {}".format(increasingcountMain))
print ("Current Main trend is {}".format(trendMain))
The actual live Binance code is listed below, with the above code included. You also need to "pip install python-binance" and "pip install websocket_client"; I got the Binance access code from ParttimeLarry.
Outcome 2 required: when run live, all calculations should run without interruption.
import itertools
import copy
import websocket, json, pprint, talib, numpy
from binance.client import Client
from binance.enums import *
#DATA FROM WEBSOCKETS########
SOCKET = "wss://stream.binance.com:9443/ws/linkusdt#kline_1m"
API_KEY = 'yourAPI_KEY'
API_SECRET ='yourAPI_Secret'
closes = [] # created for RSI indicator only using closes
in_position = False
client = Client(API_KEY, API_SECRET) # tld='us'
def order(side, quantity, symbol,order_type=ORDER_TYPE_MARKET):
try:
print("sending order")
order = client.create_order(symbol=symbol, side=side, type=order_type, quantity=quantity)
print(order)
except Exception as e:
print("an exception occured - {}".format(e))
return False
return True
def on_open(ws):
print('opened connection')
# start_time = datetime.datetime.now().time().strftime('%H:%M:%S')
# try:
# file = open("C:/GITPROJECTS/binance-bot/csvstats.txt","+a")
# file.write("New Session Open Connection Start at time {}\n".format(datetime.datetime.now())))
# finally:
# file.close()
def on_close(ws):
print('closed connection')
def on_message(ws, message):
global closes, in_position
print('received message')
json_message = json.loads(message)
pprint.pprint(json_message)
candle = json_message['k']
is_candle_closed = candle['x']
close = candle['c']
if is_candle_closed:
print("candle closed at {}".format(close))
closes.append(float(close))
print("closes")
print(closes)
####################################################################################
########CALCULATIONS ON INDICATORS #################################################
# dataingestperiod = 5
l = copy.deepcopy(closes)
maincountofcloses = len(l)
print ("Total count of closes so far {}".format(maincountofcloses))
#### VARIABLES & SETTINGS ####
l = copy.deepcopy(closes)
dataingestperiod = 3
original_list_count = len(l)
#print ("Main list count so far is {}".format(original_list_count))
timeframed_list = l[-dataingestperiod:]
timeframed_list_count = len(timeframed_list)
#print ("Timeframed list count set at max {}".format(timeframed_list_count))
def groupSequence(x):
it = iter(x)
prev, res = next(it), []
while prev is not None:
start = next(it, None)
if start and start > prev:
res.append(prev)
elif res:
yield list(res + [prev])
res = []
prev = start
def divbyZero(increasingcount,decreasingcount):
try: return increasingcount/decreasingcount
except ZeroDivisionError: return 0
def divbyZeroMain(increasingcountMain,decreasingcountMain):
try: return increasingcountMain/decreasingcountMain
except ZeroDivisionError: return 0
#### TIMEFRAMED LIST CALCULATIONS#####
listA_1 = (list(groupSequence(timeframed_list))) # obtain numbers in increasing sequence
# print(len(listA_1)) # number of increases in mixed format
listA = list(itertools.chain.from_iterable(listA_1)) # remove double brackets to enable list count
increasingcount = len(listA)
decreasingcount = timeframed_list_count - increasingcount
trend = divbyZero(increasingcount,decreasingcount)
#### MAIN APPENDING LIST CALCULATIONS #####
listMain_1 = (list(groupSequence(l)))
listMain = list(itertools.chain.from_iterable(listMain_1))
increasingcountMain = len(listMain)
decreasingcountMain = original_list_count - increasingcountMain
trendMain = divbyZeroMain(increasingcountMain,decreasingcountMain)
increase_list_timeframed = []
for x in listA:
increase_list_timeframed.append(x)
increase_list_Main = []
for x in listMain:
increase_list_Main.append(x)
###Prints ON TIMEFRAMED LIST ####
print ("---------------")
print ("---------------")
print ("Timeframed list count set at max {}".format(timeframed_list_count))
print ("Count of decreases in timeframed list is {}".format(decreasingcount))
print ("Count of increases in timeframed list is {}".format(increasingcount))
print ("Current timeframed trend is {}".format(trend))
print ("---------------")
print ("---------------")
###Prints ON MAIN LIST ####
print ("Main appending list count so far is {}".format(original_list_count))
print ("Count of decreases in Main appending list is {}".format(decreasingcountMain))
print ("Count of increases in Main appending list is {}".format(increasingcountMain))
print ("Current Main trend is {}".format(trendMain))
# threading.Timer(10.0, divbyZeroMain).start()
# threading.Timer(10.0, divbyZero).start()
# ws = websocket.WebSocketApp(SOCKET, on_open=on_open, on_close=on_close, on_message=on_message)
# ws.run_forever()
ws = websocket.WebSocketApp(SOCKET, on_open=on_open, on_close=on_close, on_message=on_message)
ws.run_forever()
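For Outcome 2, one option (a sketch, not part of the original code) is to keep on_message as light as possible: push each closed candle onto a queue and let a separate worker thread do the list calculations, so the websocket callback returns immediately. This assumes groupSequence, divbyZero/divbyZeroMain and a Calculations() helper like the one sketched earlier live at module level rather than inside on_message:
import queue
import threading

close_queue = queue.Queue()

def calculation_worker():
    closes_local = []
    while True:
        close = close_queue.get()   # blocks until a new close arrives
        closes_local.append(close)
        print(Calculations(closes_local, dataingestperiod=3))

threading.Thread(target=calculation_worker, daemon=True).start()

# In on_message, right after closes.append(float(close)), add:
#     close_queue.put(float(close))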
My aim is to increase the throughput of versioning data in Cassandra. I have used concurrent reads and writes and have also increased the chunk size that my code reads from the file. My machine has 16 GB of RAM and 8 cores, and yes, I have changed Cassandra's yaml file to allow 10k concurrent reads and writes; when I timed it, I found that 10,000 writes/reads take less than a second.
My minimal, viable code is:
import json
import logging
import os
import sys
from datetime import datetime
from hashlib import sha256, sha512, sha1
import pandas as pd
from cassandra import ConsistencyLevel, WriteTimeout
from cassandra.cluster import (EXEC_PROFILE_DEFAULT, BatchStatement, Cluster,
ExecutionProfile)
from cassandra.concurrent import (execute_concurrent,
execute_concurrent_with_args)
from cassandra.query import SimpleStatement, dict_factory
class PythonCassandraExample:
def __init__(self, file_to_be_versioned, working_dir=os.getcwd(), mode='append'):
self.cluster = None
self.session = None
self.keyspace = None
self.log = None
self.mode = mode
self.file_to_be_versioned = file_to_be_versioned
self.insert_patch = []
self.delete_patch = []
self.update_patch = []
self.working_dir = working_dir
def __del__(self):
self.cluster.shutdown()
def createsession(self):
profile = ExecutionProfile(
row_factory=dict_factory,
request_timeout=6000
)
self.cluster = Cluster(
['localhost'],
connect_timeout=50,
execution_profiles={
EXEC_PROFILE_DEFAULT: profile
}
)
self.session = self.cluster.connect(self.keyspace)
def getsession(self):
return self.session
# Add some logging so we can see what went wrong
def setlogger(self):
log = logging.getLogger()
log.setLevel('INFO')
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(
"%(asctime)s [%(levelname)s] %(name)s: %(message)s"))
log.addHandler(handler)
self.log = log
# Create Keyspace based on Given Name
def handle_error(self, exception):
self.log.error("Failed to fetch user info: %s", exception)
def createkeyspace(self, keyspace):
"""
:param keyspace: The Name of Keyspace to be created
:return:
"""
# Create the keyspace if it does not already exist
self.log.info("creating keyspace...")
self.session.execute("""
CREATE KEYSPACE IF NOT EXISTS %s
WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': '1' }
""" % keyspace)
self.log.info("setting keyspace...")
self.keyspace = keyspace
self.session.set_keyspace(self.keyspace)
def create_table_and_set_version(self, table_name):
self.table_name = table_name.lower()
table_select_query = "SELECT table_name FROM system_schema.tables WHERE keyspace_name='{}' AND table_name='{}'".format(
self.keyspace, self.table_name)
print(table_select_query)
table_exists = self.session.execute(table_select_query).one()
self.log.info("Table exists: {}".format(table_exists))
if table_exists:
self.log.info(
"Datapackage already exists! Checking the last version")
self.version = self.session.execute(
"SELECT version FROM {} LIMIT 1".format(self.table_name)).one()
self.log.info(
"The version fetched is: {} of type: {}".format(
self.version, type(self.version)
)
)
if not self.version:
self.version = 0
else:
self.version = self.version['version']
else:
self.log.info("Table didn't exist!")
self.version = 0
self.target_version = int(str(self.version)) + 1
self.log.info(
"Current and candidate versions are: {}, {}".format(
self.version,
self.target_version
)
)
# c_sql = "CREATE TABLE IF NOT EXISTS {} (id varchar, version int, row varchar, row_hash varchar, PRIMARY KEY(id, version)) with clustering order by (version desc)".format(
# self.table_name)
c_sql = "CREATE TABLE IF NOT EXISTS {} (id varchar, version int, row_hash varchar, PRIMARY KEY(version, id))".format(
self.table_name
)
self.session.execute(c_sql)
self.log.info("DP Table Created !!!")
self.log.info("Current and candidate versions are: {}, {}".format(
self.version, self.target_version))
def push_to_update_patch(self, update_patch_file, last_patch=False):
if len(self.update_patch) >= 10000:
with open(update_patch_file, mode='a') as json_file:
json_file.writelines(
self.update_patch
)
del self.update_patch[:]
if last_patch is True and len(self.update_patch) > 0:
with open(update_patch_file, mode='a') as json_file:
json_file.writelines(
self.update_patch
)
del self.update_patch[:]
def push_to_insert_patch(self, insert_patch_file, last_patch=False):
if len(self.insert_patch) >= 10000:
with open(insert_patch_file, mode='a') as json_file:
json_file.writelines(
self.insert_patch
)
del self.insert_patch[:]
if last_patch is True and len(self.update_patch) > 0:
with open(insert_patch_file, mode='a') as json_file:
json_file.writelines(
self.insert_patch
)
del self.insert_patch[:]
def push_to_delete_patch(self, delete_patch_file, last_patch=False):
if len(self.delete_patch) >= 10000:
with open(delete_patch_file, mode='a') as json_file:
json_file.writelines(
self.delete_patch
)
del self.delete_patch[:]
if last_patch is True and len(self.delete_patch) > 0:
with open(delete_patch_file, mode='a') as json_file:
json_file.writelines(
self.delete_patch
)
del self.delete_patch[:]
def push_to_patch(self, key, value, mode='update'):
return
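# NOTE: this early return short-circuits the method, so none of the
# patch-building code below ever runs and no patch files are written.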
if key is None or value is None:
raise ValueError(
"Key or value or both not specified for making a patch. Exiting now."
)
data = {}
data["id"] = str(key)
data["data"] = json.dumps(value, default=str)
# convert dict to json str so that the patch is a list of line jsons.
data = json.dumps(data, default=str)
json_patch_file = os.path.join(
self.working_dir,
"version_{}_{}.json".format(
self.target_version, mode
)
)
if mode == 'update':
self.update_patch.append(
data + "\n"
)
self.push_to_update_patch(
json_patch_file
)
if mode == 'insert':
self.insert_patch.append(
data + "\n"
)
self.push_to_insert_patch(
json_patch_file
)
if mode == 'delete':
self.delete_patch.append(
data + "\n"
)
self.push_to_delete_patch(
json_patch_file
)
def clone_version(self):
if self.mode == 'replace':
return
self.log.info("Cloning version")
start_time = datetime.utcnow()
if self.version == 0:
return
insert_sql = self.session.prepare(
(
"INSERT INTO {} ({}, {}, {}) VALUES (?,?,?)"
).format(
self.table_name, "id", "version", "row_hash"
)
)
futures = []
current_version_query = "SELECT id, row_hash FROM {} WHERE version={}".format(
self.table_name, self.version
)
current_version_rows = self.session.execute(
current_version_query
)
for current_version_row in current_version_rows:
params = (
current_version_row['id'],
self.target_version,
current_version_row['row_hash']
)
futures.append(
(
insert_sql,
params
)
)
self.log.info(
"Time taken to clone the version is: {}".format(
datetime.utcnow() - start_time
)
)
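# NOTE: the (insert_sql, params) pairs collected in `futures` above are never
# passed to execute_concurrent inside clone_version, so the previous version's
# rows are not actually copied here.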
def hash_string(self, value):
return (sha1(str(value).encode('utf-8')).hexdigest())
def hash_row(self, row):
row_json = json.dumps(row, default=str)
return (self.hash_string(row_json))
def insert_data(self, generate_diff=False):
self.generate_diff = generate_diff
destination = self.file_to_be_versioned
chunksize = 100000
concurrency_value = 1000
patch_length_for_cql = chunksize
chunks = pd.read_csv(destination, chunksize=chunksize)
chunk_counter = 0
insert_sql = self.session.prepare(
(
"INSERT INTO {} ({}, {}, {}) VALUES (?,?,?)"
).format(
self.table_name, "id", "version", "row_hash"
)
)
select_sql = self.session.prepare(
(
"SELECT id, version, row_hash FROM {} WHERE version=? AND id=?"
).format(
self.table_name
)
)
futures = []
check_for_patch = []  # (select_sql, params) pairs used to check whether a row is an update or an insert
rows_for_checking_patch = []
start_time = datetime.utcnow()
for df in chunks:
rows_for_checking_patch = df.values.tolist()
chunk_counter += 1
df["row_hash"] = df.apply(
self.hash_row
)
df["key"] = df["column_test_3"].apply(
self.hash_string
)
keys = list(df["key"])
row_hashes = list(df["row_hash"])
start_time_de_params = datetime.utcnow()
for i in range(chunksize):
row_check = None
params = (
str(keys[i]),
self.target_version,
str(row_hashes[i])
)
check_for_patch_params = (
self.version,
str(keys[i])
)
check_for_patch.append(
(
select_sql,
check_for_patch_params
)
)
futures.append(
(
insert_sql,
params
)
)
self.log.info("Time for params: {}".format(datetime.utcnow() - start_time_de_params))
if len(check_for_patch) >= patch_length_for_cql:
start_time_de_update = datetime.utcnow()
results = execute_concurrent(
self.session, check_for_patch, concurrency=concurrency_value, raise_on_first_error=False
)
self.log.info("Time for just the query: {}".format(datetime.utcnow() - start_time_de_update))
row_counter_for_patch = 0
for (success, result) in results:
if not result:
self.push_to_patch(
keys[row_counter_for_patch],
rows_for_checking_patch[row_counter_for_patch],
mode='insert'
)
row_counter_for_patch += 1
continue
if not success:
# result will be an Exception
self.log.error("Error has occurred in insert cql")
self.handle_error(result)
id_to_be_compared = result[0]["id"]
row_hash_to_be_compared = result[0]["row_hash"]
if (row_hash_to_be_compared != row_hashes[row_counter_for_patch]):
self.push_to_patch(
id_to_be_compared,
rows_for_checking_patch[row_counter_for_patch]["row"],
mode='update'
)
row_counter_for_patch += 1
del check_for_patch[:]
del rows_for_checking_patch[:]
row_counter_for_patch = 0
self.log.info("Time for check patch: {}".format(
datetime.utcnow() - start_time_de_update
))
if (len(futures) >= patch_length_for_cql):
start_time_de_insert = datetime.utcnow()
results = execute_concurrent(
self.session, futures, concurrency=concurrency_value, raise_on_first_error=False
)
for (success, result) in results:
if not success:
# result will be an Exception
self.log.error("Error has occurred in insert cql")
self.handle_error(result)
del futures[:]
self.log.info("Time for insert patch: {}".format(
datetime.utcnow() - start_time_de_insert
))
self.log.info(chunk_counter)
# self.log.info("This chunk got over in {}".format(datetime.utcnow() - start_time))
if len(check_for_patch) > 0:
results = execute_concurrent(
self.session, check_for_patch, concurrency=concurrency_value, raise_on_first_error=False
)
row_counter_for_patch = 0
for (success, result) in results:
if not result:
self.push_to_patch(
rows_for_checking_patch[row_counter_for_patch]["id"],
rows_for_checking_patch[row_counter_for_patch]["row"],
mode='insert'
)
row_counter_for_patch += 1
continue
if not success:
# result will be an Exception
self.log.error("Error has occurred in insert cql")
self.handle_error(result)
id_to_be_compared = result[0]["id"]
row_hash_to_be_compared = result[0]["row_hash"]
if (row_hash_to_be_compared != rows_for_checking_patch[row_counter_for_patch]["row_hash"]):
self.push_to_patch(
id_to_be_compared,
rows_for_checking_patch[row_counter_for_patch]["row"],
mode='update'
)
row_counter_for_patch += 1
del check_for_patch[:]
del rows_for_checking_patch[:]
if len(futures) > 0: # in case the last dataframe has #rows < 10k.
results = execute_concurrent(
self.session, futures, concurrency=concurrency_value, raise_on_first_error=False
)
for (success, result) in results:
if not success:
self.handle_error(result)
del futures[:]
self.log.info(chunk_counter)
# Check the delete patch
if self.generate_diff is True and self.mode == 'replace' and self.version != 0:
self.log.info("We got to find the delete patch!")
start_time = datetime.utcnow()
current_version_query = "SELECT id, row, row_hash FROM {} WHERE version={}".format(
self.table_name, self.version
)
current_version_rows = self.session.execute(
current_version_query
)
for current_version_row in current_version_rows:
row_check_query = "SELECT {} FROM {} WHERE {}={} AND {}='{}' ".format(
"id", self.table_name, "version", self.target_version, "id", current_version_row.id
)
row_check = self.session.execute(row_check_query).one()
if row_check is not None: # row exists in both version.
continue
self.push_to_patch(
current_version_row.id,
current_version_row.id,
mode="delete"
)
print("Complete insert's duration is: {}".format(
datetime.utcnow() - start_time)
)
# Calling last_patch for all remaining diffs
modes = [
'update',
'insert',
'delete'
]
for mode in modes:
json_patch_file = os.path.join(
self.working_dir,
"version_{}_{}.json".format(
self.target_version, mode
)
)
if mode == 'update':
self.push_to_update_patch(
json_patch_file,
last_patch=True
)
if mode == 'insert':
self.push_to_insert_patch(
json_patch_file,
last_patch=True
)
if mode == 'delete':
self.push_to_delete_patch(
json_patch_file,
last_patch=True
)
if __name__ == '__main__':
example1 = PythonCassandraExample(
file_to_be_versioned="hundred_million_eleven_columns.csv"
)
example1.createsession()
example1.setlogger()
example1.createkeyspace('sat_athena_one')
example1.create_table_and_set_version('five_hundred_rows')
example1.clone_version()
example1.insert_data(generate_diff=True)
I have a csv file of 100M rows and 11 cols. The script used to generate such a file is:
import csv
import sys
import os
import pandas as pd
file_name = "hundred_million_eleven_columns.csv"
rows_list = []
chunk_counter = 1
headers = [
"column_test_1",
"column_test_2",
"column_test_3",
"column_test_4",
"column_test_5",
"column_test_6",
"column_test_7",
"column_test_8",
"column_test_9",
"column_test_10",
"column_test_11",
]
file_exists = os.path.isfile(file_name)
with open(file_name, 'a') as csvfile:
writer = csv.DictWriter(csvfile, delimiter=',',
lineterminator='\n', fieldnames=headers)
if not file_exists:
writer.writeheader() # file doesn't exist yet, write a header
for i in range(100000000):
dict1 = [
i, i+1, i+2, i+3, i+4, i+5, i+6, i+7, i+8, i+9, i+10
]
# get input row in dictionary format
# key = col_name
rows_list.append(dict1)
if len(rows_list) == 100000:
df = pd.DataFrame(rows_list)
df.to_csv(file_name,
mode='a', index=False, header=False)
del rows_list[:]
del df
print(chunk_counter)
chunk_counter += 1
if len(rows_list) > 0:
df = pd.DataFrame(rows_list)
df.to_csv(file_name, mode='a', index=False, header=False)
del rows_list[:]
del df
print(chunk_counter)
chunk_counter += 1
My cassandra's yaml file is here
Make sure that your code can even generate that much load at 50k/sec. If you remove the executes, can you even read the CSV and generate the SHAs that fast? A C* instance on a host that size with SSDs should be able to do 50k writes/sec, but there's a lot going on outside of the C* writes that is likely part of the issue.
If your concurrent reads/writes are above 128, you are going to have some serious issues. On a system that can handle it, even 64 is enough to go past 200k writes a second; you are actually going to make things much worse with a setting that high. There is no I/O involved in that, so as the documentation states, 8x your core count is a good value. I would even recommend lowering the concurrency you're pushing from 10k to something like 1024 or even lower. You can play with different settings to see how they impact things.
Make sure the Python driver was built with its Cython extensions on your install, as it is going to be dominated by serialization otherwise. Speaking of which, the Python driver is the slowest of the drivers, so keep that in mind.
Your blocking on the SHA can be a majority of the time. Without perf tracing, just try it with a fixed value to see the difference.
"My machine" -- is this a single-node cluster? If you're throwing availability out the window anyway, you might as well disable durable_writes on the keyspace to speed up the writes a bit. The heap settings are missing, but make sure you have a minimum of 8 GB; even if this is a pretty small host, Cassandra needs memory. If you are not reading, consider disabling the key cache, and maybe disable compactions while the job is running and re-enable them afterwards.
The comment in cassandra.yaml recommends 8 * number of cores:
On the other hand, since writes are almost never IO bound, the ideal
number of "concurrent_writes" is dependent on the number of cores in
your system; (8 * number_of_cores) is a good rule of thumb.
So 64 is appropriate on an 8-core machine:
concurrent_reads: 64
concurrent_writes: 64
concurrent_counter_writes: 64
These limits are recommended because there are many other I/O operations besides the normal reads and writes, e.g. writing the commit log, caching, compaction, replication, and materialized views (if they exist).
Some other rules of thumb:
disk_optimization_strategy: ssd  # if your disk is an HDD, change the value to spinning
Use a dedicated commit log disk; SSD recommended.
more disks = better performance
I get this error when I run the script and I cannot see the solution. This program is supposed to draw a giveaway winner from a sqlite3 file which holds the number of raffle tickets for each user. Recently the program that creates the sqlite3 file updated some stuff (the draw script is made by me) and I cannot figure out the solution.
Traceback (most recent call last):
File "C:\Users\Admin\Desktop\Draw\Test\dave-draw.py", line 244, in <module>
dd = DaveDraw()
File "C:\Users\Admin\Desktop\Draw\Test\dave-draw.py", line 64, in __init__
self.get_viewers()
File "C:\Users\Admin\Desktop\Draw\Test\dave-draw.py", line 215, in
get_viewers
''').fetchall()
sqlite3.OperationalError: no such column: viewer_id
Here's the code:
#!/usr/bin/env python3
import pdb
import random
import sqlite3
class Viewer(object):
def __init__(self,
viewer_id,
twitch_name,
beam_name,
beam_id,
viewer_type,
rank,
points,
points2,
hours,
raids,
gains_currency,
gains_hours,
in_giveaways,
last_seen,
sub,
entrance_message,
entrance_message_type,
entrance_sfx
):
self.viewer_id = viewer_id
self.twitch_name = twitch_name
self.beam_name = beam_name
self.beam_id = beam_id
self.viewer_type = viewer_type
self.rank = rank
self.points = points
self.points2 = points2
self.hours = hours
self.raids = raids
self.gains_currency = gains_currency
self.gains_hours = gains_hours
self.in_giveaways = in_giveaways
self.last_seen = last_seen
self.sub = sub
self.entrance_message = entrance_message
self.entrance_message_type = entrance_message_type
self.entrance_sfx = entrance_sfx
def win_chance(self, total_tickets):
"""
Takes the total tickets (points) as a parameter and works
out the percentage chance that the viewer has of winning.
Returns the viewers win chance in percent.
"""
percent = total_tickets / 100.00
return self.points2 / percent
class DaveDraw(object):
def __init__(self):
self.debug = False
self.database_path = 'Viewers3DB.sqlite'
self.db_conn = sqlite3.connect(self.database_path)
self.get_viewers()
self.calculate_total_points()
self.assign_tickets()
def assign_tickets(self):
"""
Assigns each user a number range based on the number of
tickets they have.
e.g.
10 1-10
10 11-20
30 21-50
1 51
"""
self.tickets = {}
latest_ticket = 0
for viewer in self.viewers:
# skip anyone with no points
if viewer.points2 == 0:
continue
ticket_range_beg = latest_ticket + 1
ticket_range_end = latest_ticket + 1 + viewer.points2
latest_ticket = ticket_range_end
viewer.tickets = range(ticket_range_beg, ticket_range_end)
# assign a range of tickets:
if self.debug:
print("Assigning viewer twitch: %s beam: %s tickets %i-%i" % (viewer.twitch_name, viewer.beam_name, viewer.tickets.start, viewer.tickets.stop))
if ticket_range_beg == ticket_range_end:
if self.debug:
print("Assigning ticket {} to {}".format(ticket_range_beg, viewer.twitch_name))
self.tickets[ticket_range_beg] = viewer
continue
for ticket in viewer.tickets:
if self.debug:
print("Assigning ticket {} to {}".format(ticket, viewer.twitch_name))
self.tickets[ticket] = viewer
def calculate_total_points(self):
"""
Gets the total amount of points awarded to all
viewers.
"""
self.total_points = 0
for viewer in self.viewers:
self.total_points += viewer.points2
self.total_points_percent = self.total_points / 100
print("Total points awarded (total tickets): %s" % self.total_points)
def draw(self):
"""
Picks a random number between 1 and total tickets, finds
the user that has been assigned tickets within that range and
returns the user.
"""
ticket = random.randint(1, self.total_points)
try:
winner = self.tickets[ticket]
except:
pdb.set_trace()
print("\n===== WINNER Twitch: {} / Beam: {} =====\n".format(winner.twitch_name, winner.beam_id))
print("Picked ticket {}\n".format(ticket))
print("Winner win chance: {:f}".format(winner.win_chance(self.total_points)))
print("Winner's ticket range: {}-{}".format(winner.tickets.start, winner.tickets.stop))
print("Winner's ticket amount: {}\n".format(winner.points2))
self.display_viewer(winner)
def display_random_viewer(self):
"""
Displays random viewer.
"""
self.display_viewer(self.get_random_viewer())
def display_viewer(self, viewer):
"""
Outputs the data on all viewers.
"""
print("""Viewer ID: %s\nTwitch Name: %s\nBeam Name: %s\nBeam ID: %s\nRank: %s\nPoints: %s\nPoints2: %s\nHours: %s\nRaids: %s\nGains Currency: %s\nGains Hours: %s\nInGiveaways: %s\nLastSeen: %s\nEntrance Message: %s\nEntranceMsgType: %s\nEntranceSFX: %s"""
% (
viewer.viewer_id,
viewer.twitch_name,
viewer.beam_name,
viewer.beam_id,
viewer.rank,
viewer.points,
viewer.points2,
viewer.hours,
viewer.raids,
viewer.gains_currency,
viewer.gains_hours,
viewer.in_giveaways,
viewer.last_seen,
viewer.entrance_message,
viewer.entrance_message_type,
viewer.entrance_sfx
)
)
def get_random_viewer(self):
"""
Gets a completely random viewer.
"""
return random.choice(self.viewers)
def get_viewers(self):
"""
Gets data on all the viewers in the database and stores
the data in self.viewers.
"""
c = self.db_conn.cursor()
viewers = c.execute('''
SELECT
viewer_id,
TwitchName,
BeamName,
BeamID,
Type,
Rank,
Points,
Points2,
Hours,
Raids,
GainsCurrency,
GainsHours,
InGiveaways,
LastSeen,
Sub,
EntranceMessage,
EntranceMsgType,
EntranceSFX
FROM Viewer
WHERE Type != 1
AND TwitchName NOT IN (
\'treeboydave\',
\'treebotdave\'
);
''').fetchall()
self.viewers = []
for cur_viewer in viewers:
self.viewers.append(
Viewer(
cur_viewer[0],
cur_viewer[1],
cur_viewer[2],
cur_viewer[3],
cur_viewer[4],
cur_viewer[5],
cur_viewer[6],
cur_viewer[7],
cur_viewer[8],
cur_viewer[9],
cur_viewer[10],
cur_viewer[11],
cur_viewer[12],
cur_viewer[13],
cur_viewer[14],
cur_viewer[15],
cur_viewer[16],
cur_viewer[17]
)
)
if __name__ == '__main__':
dd = DaveDraw()
dd.draw()
All your other SQL columns are capitalised, any chance that's why it's not finding the viewer_id column? Maybe it's Viewer_Id or similar?
If you execute "PRAGMA table_info(Viewer)" and print what it returns, it will give you an outline of all of the columns in that table, so you can make sure you have the capitalisation correct, or see whether the column actually isn't there at all.
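For example, a quick sketch (the database file and table names are taken from the question):
import sqlite3

conn = sqlite3.connect('Viewers3DB.sqlite')
# PRAGMA table_info returns (cid, name, type, notnull, dflt_value, pk) per column
for cid, name, col_type, notnull, default, pk in conn.execute('PRAGMA table_info(Viewer)'):
    print(name, col_type)
conn.close()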
Problem: I am trying to extract data through an API service. A single request can take anywhere from 3 to 10 seconds. There are roughly 20,000 rows of data from a pandas DataFrame to feed into the API calls. I have managed to speed it up a bit through multiprocessing, but it is still running very slowly. Any suggestions?
Code:
import json
import multiprocessing
import time
from multiprocessing import Pool

import numpy as np
import pandas as pd
import requests

def scored_card_features2(source, n_batches):
"""Multiprocessing version of Scored Card Features Function
Returns reason for rating
"""
# read in source data and convert to list of lists for inputs
data = pd.read_excel(source)
data = data[['primary_bank_report_id', 'primary_tu_credit_report_id', 'purpose']]
inputs = data.values.tolist()
def scored_card_map(i):
"""form request to scored card service and retrieve values"""
url = "url/FourthGen?bank_report_id=%s&credit_report_id=%s&" \
"&loan_purpose=%s" % (i[0], i[1], i[2].replace(" ", "%20"))
r = requests.get(url)
try:
d = json.loads(r.text)
l = [d['probability_of_default'],
d['condition'],
d['purpose_of_loan'],
d['rating'],
d['bank_report_id'],
d['reason_for_rating'],
d['credit_report_id']]
return l
except:
l = [np.nan] * 7
return l
# initiate the multiprocessing pool
with Pool(n_batches) as p:
vals = p.map(scored_card_map, inputs)
result = pd.DataFrame(vals, columns=['Probability of Default', 'Condition', 'Purpose of Loan', 'Rating', 'Bank Report ID',
'Reason for Rating', 'Credit Report ID'])
result = result.dropna(how='all')
return result
if __name__ == '__main__':
# model features
start = time.time()
df = scored_card_features2('BankCreditPortalIDsPurpose.xlsx', multiprocessing.cpu_count()-1)
df.to_csv('scored_card_features.csv', index=False)
end = time.time()
print(end-start)
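Since each request mostly waits on the network, the work is I/O-bound, so a thread pool usually scales better here than a small number of processes (and avoids the pickling restriction on nested functions). A minimal sketch; the URL template and column names are copied from the question, while the worker count and timeout are guesses to tune:
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd
import requests

def fetch_one(i):
    # build the same request URL as in the question
    url = ("url/FourthGen?bank_report_id=%s&credit_report_id=%s&"
           "&loan_purpose=%s" % (i[0], i[1], i[2].replace(" ", "%20")))
    try:
        d = requests.get(url, timeout=30).json()
        return [d['probability_of_default'], d['condition'], d['purpose_of_loan'],
                d['rating'], d['bank_report_id'], d['reason_for_rating'],
                d['credit_report_id']]
    except Exception:
        return [np.nan] * 7

def scored_card_features_threaded(source, n_workers=32):
    data = pd.read_excel(source)
    inputs = data[['primary_bank_report_id', 'primary_tu_credit_report_id', 'purpose']].values.tolist()
    with ThreadPoolExecutor(max_workers=n_workers) as pool:
        vals = list(pool.map(fetch_one, inputs))
    cols = ['Probability of Default', 'Condition', 'Purpose of Loan', 'Rating',
            'Bank Report ID', 'Reason for Rating', 'Credit Report ID']
    return pd.DataFrame(vals, columns=cols).dropna(how='all')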