I have a class for multiprocessing in Python which creates 3 different processes. The first process checks for any signal from my hardware and pushes it into a Queue, the second process takes the data out of the Queue and pushes it into a database, and the third process takes the data out of the database and pushes it to a server.
obj = QE()
stdFunct = standardFunctions()
watchDogProcess = multiprocessing.Process(target=obj.watchDog)
watchDogProcess.start()
pushToDBSProcess = multiprocessing.Process(target=obj.pushToDBS)
pushToDBSProcess.start()
pushToCloud = multiprocessing.Process(target=stdFunct.uploadCycleTime)
pushToCloud.start()
watchDogProcess.join()
pushToDBSProcess.join()
pushToCloud.join()
My first two processes are running perfectly as desired; however, I am struggling with the third process. The following is the code of my third process:
def uploadCycleTime(self):
while True:
uploadCycles = []
lastUpPointer = "SELECT id FROM lastUploaded"
lastUpPointer = self.dbFetchone(lastUpPointer)
lastUpPointer = lastUpPointer[0]
# print("lastUploaded :"+str(lastUpPointer))
cyclesToUploadSQL = "SELECT id,machineId,startDateTime,endDateTime,type FROM cycletimes WHERE id > "+str(lastUpPointer)
cyclesToUpload = self.dbfetchMany(cyclesToUploadSQL,15)
cyclesUploadLength = len(cyclesToUpload)
if(cyclesUploadLength>0):
for cycles in cyclesToUpload:
uploadCycles.append({"dataId":cycles[0],"machineId":cycles[1],"startDateTime":cycles[2].strftime('%Y-%m-%d %H:%M:%S.%f'),"endDateTime":cycles[3].strftime('%Y-%m-%d %H:%M:%S.%f'),"type":cycles[4]})
# print("length : "+str(cyclesUploadLength))
lastUpPointer = uploadCycles[cyclesUploadLength-1]["dataId"]
uploadCycles = json.dumps(uploadCycles)
api = self.dalUrl+"/cycle-times"
uploadResponse = self.callPostAPI(api,str(uploadCycles))
print(lastUpPointer)
changePointerSQL = "UPDATE lastUploaded SET id="+str(lastUpPointer)
try:
changePointerSQL = self.dbAbstraction(changePointerSQL)
except Exception as errorPointer:
print("Pointer change Error : "+str(errorPointer))
time.sleep(2)
Now I am saving a pointer to remember the last id uploaded, and from there on I keep uploading batches of 15 packets. When data already exists in the DB the code works well; however, if there is no data when the process is initiated and data is inserted afterwards, it fails to fetch the new rows from the DB.
I tried printing the length in real time and it keeps giving me 0, in spite of data being continuously pushed into the DB in real time.
It turned out that in my upload process I had missed out on a commit():
def dbFetchAll(self,dataString):
# dataToPush = self.cycletimeQueue.get()
# print(dataToPush)
dbTry = 1
try:
while(dbTry == 1): # This while is to ensure the data has been pushed
sql = dataString
self.conn.execute(sql)
response = self.conn.fetchall()
dbTry = 0
return response
# print(self.conn.rowcount, "record inserted.")
except Exception as error:
print ("Error : "+str(error))
return dbTry
finally:
    self.mydb.commit()  # <-- this commit was the missing piece
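For anyone hitting the same symptom: the reader connection was most likely stuck on an old snapshot (InnoDB's default REPEATABLE READ isolation), so SELECTs inside the long-running polling transaction kept returning the rows visible when that transaction started. Committing after each poll, as above, ends the transaction and refreshes the snapshot. A minimal sketch of the pattern, assuming mysql.connector and placeholder credentials/table names:
import time
import mysql.connector  # assumption: any DB-API driver behaves the same way here

mydb = mysql.connector.connect(host="localhost", user="user",
                               password="password", database="mydb")
cursor = mydb.cursor()

while True:
    cursor.execute("SELECT COUNT(*) FROM cycletimes")
    print(cursor.fetchone()[0])   # without the commit below, this count never changes
    mydb.commit()                 # end the read transaction so the next SELECT sees new rows
    time.sleep(2)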
Is it possible to use multiprocessing in Django on a request?
#so if I send a request to http://127.0.0.1:8000/wallet_verify
def wallet_verify(request):
walelts = botactive.objects.all()
#here I check if the user wants to be included in the process or not: if they set active to True then I'll include them, else ignore them.
for active in walelts:
check_active = active.active
if check_active == True:
user_is_active = active.user
#for the ones that want to be included I then go to get their key data.
#I need to get both the api key and the secret, so I loop through to get the data from the active users.
database = Bybitapidatas.objects.filter(user=user_is_active)
for apikey in database:
apikey = apikey.apikey
for apisecret in database:
apisecret = apisecret.apisecret
#since I am making a request to an exchange endpoint I can only include one API key and secret at a time, i.e. one person at a time. This is why I want to run it in parallel.
for a, b in zip(list(Bybitapidatas.objects.filter(user=user_is_active).values("apikey")), list(Bybitapidatas.objects.filter(user=user_is_active).values("apisecret"))):
session =spot.HTTP(endpoint='https://api-testnet.bybit.com/', api_key=a['apikey'], api_secret=b['apisecret'])
#here I check to see if they have balance to open trades if they have selected to be included.
GET_USDT_BALANCE = session.get_wallet_balance()['result']['balances']
for i in GET_USDT_BALANCE:
if 'USDT' in i.values():
GET_USDT_BALANCE = session.get_wallet_balance()['result']['balances']
idx_USDT = GET_USDT_BALANCE.index(i)
GET_USDTBALANCE = session.get_wallet_balance()['result']['balances'][idx_USDT]['free']
print(round(float(GET_USDTBALANCE),2))
#if they don't have enough balance I skip the user.
if round(float(GET_USDTBALANCE),2) < 11 :
pass
else:
session.place_active_order(
symbol="BTCUSDT",
side="Buy",
type="MARKET",
qty=10,
timeInForce="GTC"
)
How can I run this process in parallel, while looping through the database to get the data for each individual user?
I am still new to coding, so I hope I explained it in a way that makes sense.
I have tried multiprocessing and pools, but then I get an error that the app has not started yet and I have to run it outside of wallet_verify. Is there a way to do it inside wallet_verify, when I send the POST request?
Any help appreciated.
Filtering the database to get the users who have set active to True:
listi = [1, 3] (these are the user IDs returned)
processess = botactive.objects.filter(active=True).values_list('user')
listi = [row[0] for row in processess]
Get the Users from the listi and perform the action.
def wallet_verify(listi):
# print(listi)
database = Bybitapidatas.objects.filter(user = listi)
print("---------------------------------------------------- START")
for apikey in database:
apikey = apikey.apikey
print(apikey)
for apisecret in database:
apisecret = apisecret.apisecret
print(apisecret)
start_time = time.time()
session =spot.HTTP(endpoint='https://api-testnet.bybit.com/', api_key=apikey, api_secret=apisecret)
GET_USDT_BALANCE = session.get_wallet_balance()['result']['balances']
for i in GET_USDT_BALANCE:
if 'USDT' in i.values():
GET_USDT_BALANCE = session.get_wallet_balance()['result']['balances']
idx_USDT = GET_USDT_BALANCE.index(i)
GET_USDTBALANCE = session.get_wallet_balance()['result']['balances'][idx_USDT]['free']
print(round(float(GET_USDTBALANCE),2))
if round(float(GET_USDTBALANCE),2) < 11 :
pass
else:
session.place_active_order(
symbol="BTCUSDT",
side="Buy",
type="MARKET",
qty=10,
timeInForce="GTC"
)
print ("My program took", time.time() - start_time, "to run")
print("---------------------------------------------------- END")
return HttpResponse("Wallets verified")
verifyt is what I use to trigger the multiprocessing, since I don't want it to run without being requested to run. The initializer starts Django's apps for each worker process.
def verifyt(request):
with ProcessPoolExecutor(max_workers=4, initializer=django.setup) as executor:
results = executor.map(wallet_verify, listi)
return HttpResponse("done")
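For reference, a more self-contained version of that view with the imports it needs; the model and helper names are the ones used above, so treat this as a sketch rather than a drop-in view:
from concurrent.futures import ProcessPoolExecutor

import django
from django.http import HttpResponse


def verifyt(request):
    # Re-query the active user IDs on every request so the worker list is fresh.
    listi = [row[0] for row in botactive.objects.filter(active=True).values_list('user')]
    # initializer=django.setup makes the ORM usable inside each spawned worker process.
    with ProcessPoolExecutor(max_workers=4, initializer=django.setup) as executor:
        executor.map(wallet_verify, listi)  # workers finish before the with-block exits
    return HttpResponse("done")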
Every time I create a new instance in my ontology, something goes wrong if I try to read from the same database again.
PS: these are all part of different views in Django.
This is how I am adding instances to my ontology:
# OWLREADY2
try:
myworld = World(filename='backup.db', exclusive=False)
kiposcrum = myworld.get_ontology(os.path.dirname(__file__) + '/kipo.owl').load()
except:
print("Error opening ontology")
# Sync
#--------------------------------------------------------------------------
sync_reasoner()
seed = str(time.time())
id_unico = faz_id(seed)
try:
with kiposcrum:
# here I am creating my instance, these are all strings I got from the user
kiposcrum[input_classe](input_nome + id_unico)
if input_observacao != "":
kiposcrum[input_nome + id_unico].Observacao.append(input_observacao)
sync_reasoner()
status = "OK!"
myworld.close()
myworld.save()
except:
print("Mistakes were made!")
status = "Error!"
input_nome = "Mistakes were made!"
input_classe = "Mistakes were made!"
finally:
print(input_nome + " " + id_unico)
print(input_classe)
print(status)
This is how I am reading stuff from it:
# OWLREADY2
try:
myworld = World(filename='backup.db', exclusive=False)
kiposcrum = myworld.get_ontology(os.path.dirname(__file__) + '/kipo_fialho.owl').load()
except:
print("Error")
sync_reasoner()
try:
with kiposcrum:
num_inst = 0
# gets a list of properties given an instance informed by the user
propriedades = kiposcrum[instancia].get_properties()
num_prop = len(propriedades)
myworld.close()
I am 100% able to read from my ontology, but if I try to create an instance and then read the database again, something goes wrong.
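One thing worth double-checking (an assumption on my part, not something stated above): the creation code closes the World before saving it, so the new instance may never be persisted to backup.db. A sketch using only the same owlready2 calls, saving before closing:
try:
    with kiposcrum:
        kiposcrum[input_classe](input_nome + id_unico)
        if input_observacao != "":
            kiposcrum[input_nome + id_unico].Observacao.append(input_observacao)
        sync_reasoner()
    status = "OK!"
    myworld.save()   # persist the new instance to backup.db first...
    myworld.close()  # ...then release the quadstore
except Exception:
    status = "Error!"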
My Problem:
The web app I'm building relies on real-time transcription of a user's voice along with timestamps for when each word begins and ends.
Google's Speech-to-Text API has a limit of 4 minutes for streaming requests, but I want users to be able to run their mics for as long as 30 minutes if they so choose.
Thankfully, Google provides its own code examples for how to make successive requests to their Speech-to-Text API in a way that mimics endless streaming speech recognition.
I've adapted their Python infinite streaming example for my purposes (see below for my code). The timestamps provided by Google are pretty accurate but the issue is that when I exceed the streaming limit (4 minutes) and a new request is made, the timestamped transcript returned by Google's API from the new request is off by as much as 5 seconds or more.
Below is an example of the output when I adjust the streaming limit to 10 seconds (so a new request to Google's Speech-to-Text API begins every 10 seconds).
The timestamp you see printed next to each transcribed response (the 'corrected_time' in the code) is the timestamp for the end of the transcribed line, not the beginning. These timestamps are accurate for the first request but are off by ~4 seconds in the second request and ~9 seconds in the third request.
In a Nutshell, I want to make sure that when the streaming limit is exceeded and a new request is made, the timestamps returned by Google for that new request are adjusted accurately.
My Code:
To help you understand what's going on, I would recommend running it on your machine (only takes a couple of minutes to get working if you have a Google Cloud service account).
I've included more detail on my current diagnosis below the code.
#!/usr/bin/env python
"""Google Cloud Speech API sample application using the streaming API.
NOTE: This module requires the dependencies `pyaudio`.
To install using pip:
pip install pyaudio
Example usage:
python THIS_FILENAME.py
"""
# [START speech_transcribe_infinite_streaming]
import os
import re
import sys
import time
from google.cloud import speech
import pyaudio
from six.moves import queue
# Audio recording parameters
STREAMING_LIMIT = 20000 # 20 seconds (originally 4 mins but shortened for testing purposes)
SAMPLE_RATE = 16000
CHUNK_SIZE = int(SAMPLE_RATE / 10) # 100ms
# Environment Variable set for Google Credentials. Put the json service account
# key in the root directory
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'YOUR_SERVICE_ACCOUNT_KEY.json'
def get_current_time():
"""Return Current Time in MS."""
return int(round(time.time() * 1000))
class ResumableMicrophoneStream:
"""Opens a recording stream as a generator yielding the audio chunks."""
def __init__(self, rate, chunk_size):
self._rate = rate
self.chunk_size = chunk_size
self._num_channels = 1
self._buff = queue.Queue()
self.closed = True
self.start_time = get_current_time()
self.restart_counter = 0
self.audio_input = []
self.last_audio_input = []
self.result_end_time = 0
self.is_final_end_time = 0
self.final_request_end_time = 0
self.bridging_offset = 0
self.last_transcript_was_final = False
self.new_stream = True
self._audio_interface = pyaudio.PyAudio()
self._audio_stream = self._audio_interface.open(
format=pyaudio.paInt16,
channels=self._num_channels,
rate=self._rate,
input=True,
frames_per_buffer=self.chunk_size,
# Run the audio stream asynchronously to fill the buffer object.
# This is necessary so that the input device's buffer doesn't
# overflow while the calling thread makes network requests, etc.
stream_callback=self._fill_buffer,
)
def __enter__(self):
self.closed = False
return self
def __exit__(self, type, value, traceback):
self._audio_stream.stop_stream()
self._audio_stream.close()
self.closed = True
# Signal the generator to terminate so that the client's
# streaming_recognize method will not block the process termination.
self._buff.put(None)
self._audio_interface.terminate()
def _fill_buffer(self, in_data, *args, **kwargs):
"""Continuously collect data from the audio stream, into the buffer."""
self._buff.put(in_data)
return None, pyaudio.paContinue
def generator(self):
"""Stream Audio from microphone to API and to local buffer"""
while not self.closed:
data = []
"""
THE BELOW 'IF' STATEMENT IS WHERE THE ERROR IS LIKELY OCCURRING
This statement runs when the streaming limit is hit and a new request is made.
"""
if self.new_stream and self.last_audio_input:
chunk_time = STREAMING_LIMIT / len(self.last_audio_input)
if chunk_time != 0:
if self.bridging_offset < 0:
self.bridging_offset = 0
if self.bridging_offset > self.final_request_end_time:
self.bridging_offset = self.final_request_end_time
chunks_from_ms = round(
(self.final_request_end_time - self.bridging_offset)
/ chunk_time
)
self.bridging_offset = round(
(len(self.last_audio_input) - chunks_from_ms) * chunk_time
)
for i in range(chunks_from_ms, len(self.last_audio_input)):
data.append(self.last_audio_input[i])
self.new_stream = False
# Use a blocking get() to ensure there's at least one chunk of
# data, and stop iteration if the chunk is None, indicating the
# end of the audio stream.
chunk = self._buff.get()
self.audio_input.append(chunk)
if chunk is None:
return
data.append(chunk)
# Now consume whatever other data's still buffered.
while True:
try:
chunk = self._buff.get(block=False)
if chunk is None:
return
data.append(chunk)
self.audio_input.append(chunk)
except queue.Empty:
break
yield b"".join(data)
def listen_print_loop(responses, stream):
"""Iterates through server responses and prints them.
The responses passed is a generator that will block until a response
is provided by the server.
Each response may contain multiple results, and each result may contain
multiple alternatives; Here we print only the transcription for the top
alternative of the top result.
In this case, responses are provided for interim results as well. If the
response is an interim one, print a line feed at the end of it, to allow
the next result to overwrite it, until the response is a final one. For the
final one, print a newline to preserve the finalized transcription.
"""
for response in responses:
if get_current_time() - stream.start_time > STREAMING_LIMIT:
stream.start_time = get_current_time()
break
if not response.results:
continue
result = response.results[0]
if not result.alternatives:
continue
transcript = result.alternatives[0].transcript
result_seconds = 0
result_micros = 0
if result.result_end_time.seconds:
result_seconds = result.result_end_time.seconds
if result.result_end_time.microseconds:
result_micros = result.result_end_time.microseconds
stream.result_end_time = int((result_seconds * 1000) + (result_micros / 1000))
corrected_time = (
stream.result_end_time
- stream.bridging_offset
+ (STREAMING_LIMIT * stream.restart_counter)
)
# Display interim results, but with a carriage return at the end of the
# line, so subsequent lines will overwrite them.
if result.is_final:
sys.stdout.write("FINAL RESULT # ")
sys.stdout.write(str(corrected_time/1000) + ": " + transcript + "\n")
stream.is_final_end_time = stream.result_end_time
stream.last_transcript_was_final = True
# Exit recognition if any of the transcribed phrases could be
# one of our keywords.
if re.search(r"\b(exit|quit)\b", transcript, re.I):
sys.stdout.write("Exiting...\n")
stream.closed = True
break
else:
sys.stdout.write("INTERIM RESULT # ")
sys.stdout.write(str(corrected_time/1000) + ": " + transcript + "\r")
stream.last_transcript_was_final = False
def main():
"""start bidirectional streaming from microphone input to speech API"""
client = speech.SpeechClient()
config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=SAMPLE_RATE,
language_code="en-US",
max_alternatives=1,
)
streaming_config = speech.StreamingRecognitionConfig(
config=config, interim_results=True
)
mic_manager = ResumableMicrophoneStream(SAMPLE_RATE, CHUNK_SIZE)
print(mic_manager.chunk_size)
sys.stdout.write('\nListening, say "Quit" or "Exit" to stop.\n\n')
sys.stdout.write("End (ms) Transcript Results/Status\n")
sys.stdout.write("=====================================================\n")
with mic_manager as stream:
while not stream.closed:
sys.stdout.write(
"\n" + str(STREAMING_LIMIT * stream.restart_counter) + ": NEW REQUEST\n"
)
stream.audio_input = []
audio_generator = stream.generator()
requests = (
speech.StreamingRecognizeRequest(audio_content=content)
for content in audio_generator
)
responses = client.streaming_recognize(streaming_config, requests)
# Now, put the transcription responses to use.
listen_print_loop(responses, stream)
if stream.result_end_time > 0:
stream.final_request_end_time = stream.is_final_end_time
stream.result_end_time = 0
stream.last_audio_input = []
stream.last_audio_input = stream.audio_input
stream.audio_input = []
stream.restart_counter = stream.restart_counter + 1
if not stream.last_transcript_was_final:
sys.stdout.write("\n")
stream.new_stream = True
if __name__ == "__main__":
main()
# [END speech_transcribe_infinite_streaming]
My Current Diagnosis
The 'corrected_time' is not being set correctly when new requests are made. This is due to the 'bridging_offset' not being set correctly. So what we need to look at is the 'generator()' method in the 'ResumableMicrophoneStream' class.
In the 'generator()' method, there is an 'if' statement which is run when the streaming limit is hit and a new request is made
if self.new_stream and self.last_audio_input:
Its purpose appears to be to take any lingering audio data that wasn't finished being transcribed before the streaming limit was hit and add it to the buffer before any new audio chunks so that it's transcribed in the new request.
It is also the responsibility of this 'if' statement to set the 'bridging offset' but I'm not entirely sure what this offset represents. All I know is that however it is being set, it is not being set accurately.
The Speech-to-Text documentation says the following about time offsets:
Time offset values show the beginning and the end of each spoken word
that is recognized in the supplied audio. A time offset value
represents the amount of time that has elapsed from the beginning of
the audio, in increments of 100ms.
This tells us that the offsets you receive are always measured from the beginning of the audio supplied in that particular request, not from the start of your overall session. That would be my guess as to why it's causing your application problems.
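In other words, every new request restarts the clock at zero, so the sample has to add back everything that came before it. A hypothetical helper to illustrate the bookkeeping (the function name and the cumulative counter are my own, not part of Google's sample): instead of adding STREAMING_LIMIT * restart_counter, you would add the audio time the previous requests actually covered.
def corrected_time_ms(result_end_time, bridging_offset, covered_audio_ms):
    """Offset within the current request, minus the audio replayed (bridged) from
    the previous request, plus the audio actually covered by completed requests
    (e.g. a running sum of final_request_end_time), all in milliseconds."""
    return result_end_time - bridging_offset + covered_audio_ms


# Example: a result 3.2 s into the second request, where the first request
# actually covered 9.6 s of audio and 0.4 s of it was replayed at the restart.
print(corrected_time_ms(3200, 400, 9600))  # 12400 -> 12.4 s into the session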
I have a piece of Python code running as a service that pulls weather data via an API.
The code itself runs perfectly fine when everything is hunky dory, i.e. the network is up, but I have noticed that sometimes the WiFi on the Pi that is pulling the API data will drop, and then the Python code seems to stop.
I have a small line of code providing the most basic of logs, but I would like to improve upon it greatly. The log code just writes datetime.now so I can see when the last time the code ran was.
#!/usr/bin/python3
#import modules
import cymysql
from time import sleep
from urllib.request import urlopen
import json
import datetime
#set MySQl Variables
host = "localhost"
user = "xxx"
password = "xxx"
schema = "xxx"
#connect to MySQL DB
db = cymysql.connect(host, user, password, schema)
curs = db.cursor()
#set api key for DarkSky API
apikey="xxx"
# Latitude & longitude
lati="-26.20227"
longi="28.04363"
# Add units=si to get it in sensible ISO units.
url="https://api.forecast.io/forecast/"+apikey+"/"+lati+","+longi+"?units=si"
#begin infinite loop
while True:
#convert API reading to json and readable array 'weather'
meteo=urlopen(url).read()
meteo = meteo.decode('utf-8')
weather = json.loads(meteo)
#set variables for current weather
cTemp = (weather['currently']['temperature'])
cCond = (weather['currently']['summary'])
cRain1 = (weather['currently']['precipProbability'])
cRain2 = cRain1*100
cIcon = (weather['currently']['icon'])
oaSum = (weather['daily']['summary'])
#print variables - for testing purposes
#print (cTemp)
#print (cCond)
#print (cRain2)
#print (cIcon)
#print (oaSum)
#extract daily data from 'weather' array
daily = (weather['daily']['data'])
#create new arrays for daily variables
listHigh = []
listLow = []
listCond = []
listRain = []
listIcon = []
#set daily variables
for i in daily:
listHigh.append(i['temperatureHigh'])
for i in range(0,len(listHigh)):
high1 = listHigh[0]
high2 = listHigh[1]
high3 = listHigh[2]
high4 = listHigh[3]
high5 = listHigh[4]
high6 = listHigh[5]
high7 = listHigh[6]
high8 = listHigh[7]
for o in daily:
listLow.append(o['temperatureLow'])
for o in range(0,len(listLow)):
low1 = listLow[0]
low2 = listLow[1]
low3 = listLow[2]
low4 = listLow[3]
low5 = listLow[4]
low6 = listLow[5]
low7 = listLow[6]
low8 = listLow[7]
for p in daily:
listCond.append(p['summary'])
for p in range(0,len(listCond)):
cond1 = listCond[0]
cond2 = listCond[1]
cond3 = listCond[2]
cond4 = listCond[3]
cond5 = listCond[4]
cond6 = listCond[5]
cond7 = listCond[6]
cond8 = listCond[7]
for m in daily:
listRain.append(m['precipProbability'])
for m in range(0,len(listRain)):
rain1 = listRain[0]
rain2 = listRain[1]
rain3 = listRain[2]
rain4 = listRain[3]
rain5 = listRain[4]
rain6 = listRain[5]
rain7 = listRain[6]
rain8 = listRain[7]
#convert rain chance to readable percentage
prain1 = rain1*100
prain2 = rain2*100
prain3 = rain3*100
prain4 = rain4*100
prain5 = rain5*100
prain6 = rain6*100
prain7 = rain7*100
prain8 = rain8*100
for l in daily:
listIcon.append(l['icon'])
for l in range (0,len(listIcon)):
icon1 = listIcon[0]
icon2 = listIcon[1]
icon3 = listIcon[2]
icon4 = listIcon[3]
icon5 = listIcon[4]
icon6 = listIcon[5]
icon7 = listIcon[6]
icon8 = listIcon[7]
#print daily variables - for testing purposes
#print (high1)
#print (low1)
#print (cond1)
#print (prain1)
#print (icon1)
#print (high2)
#print (low2)
#print (cond2)
#print (prain2)
#print (icon2)
#update data in DataBase
try:
sql_update_query = """UPDATE weather SET current_temp = %s, cur$
varis = (cTemp, cCond, cRain2, cIcon, high1, low1, cond1, prain$
curs.execute(sql_update_query, varis)
db.commit()
except db.Error as error:
print("Error: {}".format(error))
db.rollback()
#write date to log file
with open ("/home/pi/CoRo/Projects/WeatherMan/weatherlog.txt", mode="w") as file:
file.write('Last Data was pulled at: %s' %(datetime.datetime.now()))
#set loop to sleep for 10 minutes and go again
sleep(600)
I understand that the database code is snipped, but it is just the variables being put into the database, which I can see works.
However, if the network disconnects, the code stops and the database is left with the last polled API data.
How would I restart the Python code if the API call fails?
Thanks in advance,
You could rewrite the portion of your code that pulls the weather data as a function or separate module. This would allow you to call it only when the network connection is working. Some pseudo code below:
if network_connection:
pull_weather_data()
else:
do_something()
do_something() could be an effort to reconnect to the network, such as resetting your network adapter.
You could determine the state of the network connection by trying to ping your router or an external IP like one of Google's DNS server (8.8.8.8 or 8.8.4.4).
To avoid nested loops you could use the continue statement. For example:
while True:
if network_connection:
pull_weather_data()
else:
reset_network_connection()
time.sleep(180) # Sleep for 3 minutes.
continue
The continue will send the interpreter back to the start of the while loop. From there it will check the network connection and either pull data or reset the network connection and sleep for another 3 minutes.
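Another option (my own sketch, not part of the answer above): let the request itself signal the failure by wrapping the urlopen call in try/except, so a dropped connection only skips that polling cycle instead of killing the service. This assumes the same url variable as the script above.
from urllib.request import urlopen
from time import sleep
import json

while True:
    try:
        meteo = urlopen(url, timeout=30).read().decode('utf-8')
        weather = json.loads(meteo)
    except OSError as err:  # URLError and socket timeouts are subclasses of OSError
        print("API call failed: {} - retrying in 3 minutes".format(err))
        sleep(180)
        continue
    # ... extract the variables and update the database as before ...
    sleep(600)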
Using Quernon's answer above, the code has been edited as follows:
#!/usr/bin/python3
#import modules
import os
import cymysql
from time import sleep
from urllib.request import urlopen
import json
import datetime
#set MySQl Variables
host = "localhost"
user = "xxx"
password = "xxx"
schema = "xxx"
#connect to MySQL DB
db = cymysql.connect(host, user, password, schema)
curs = db.cursor()
#set api key for DarkSky API
apikey="xxx"
# Latitude & longitude
lati="-26.20227"
longi="28.04363"
# Add units=si to get it in sensible SI units rather than Fahrenheit.
url="https://api.forecast.io/forecast/"+apikey+"/"+lati+","+longi+"?units=si"
#begin infinite loop
while True:
#function to check if there is an internet connection
def check_ping():
hostname = "8.8.8.8"
response = os.system("ping -c 1 " + hostname)
#and then check the response...
if response == 0:
pingstatus = 0
else:
pingstatus = 1
return pingstatus
networkstatus = check_ping()
#print check_ping() - for testing purposes
#print (networkstatus)
#function to pull weather data from API
def get_weather():
#insert weather data here with no changes
if networkstatus == 0:
get_weather()
else:
print ("Resetting Network Adapters")
dwnnw = 'ifconfig wlan0 down'
upnw = 'ifconfig wlan0 up'
os.system(dwnnw)
os.system(upnw)
sleep(180)
continue
I have a python script which validates data fetched from some rows in a database and then logs the errors in a different table in the same database.
The script validates each row and marks it as validated & has error = True/False depending on the validation outcome. This process is repeated for each row. With that, I thought I'd add some steroids by creating threads such that the validation for each row is done by independent threads thus reducing the time it takes to validate a batch of rows.
To my surprise, I find that the threaded script is taking slightly longer than the non-threaded one. On average, to validate 1502 rows of data, the non-threaded script takes 1.5 seconds while the threaded script takes 2.27 seconds. That might not be much, but ideally I'll be running through 2 million records at a go, so that time overhead will be significant. Plus, I would assume that threaded apps would finish faster! :-)
The two scripts clock the same time of about 0.01 seconds up to the point of creating threads. By this point the SQLAlchemy session is created and all the data to be validated and its relations, i.e. foreign keys etc., are fetched. From there, though, the non-threaded script finishes faster. Below is my code.
1.0 Non-Threaded Script
#Alot of code goes above this to fetch the data that is passed on to the validator function
#However, the two scripts are the same upto this point in regards to time taken so didn't see need to post them.
for lf_detail_id in load_file_detail_id:
params = lf_detail_id, load_file_id, entry_number[lf_detail_counter], \
data[lf_detail_counter], template_version[lf_counter], \
load_file_detail, error, dt_file, dt_columns
data_list.append(params)
lf_detail_counter += 1
no_of_records += 1
validator = Validate()
validator.validator(no_of_records, data_list)
record_counter += lf_detail_counter
data_list = None
no_of_records = 0
print("Validated '%s': seconds %s" %(filename[lf_counter], time.time()-file_start_time)) #print time it took to run'
#Mark the load file as validated
is_done = load_file.set_validation(load_file_id, True)
if is_done == False:
raise Exception ("Can't update load_file's is_validated parameter: ", lf_detail_id)
#Reset counters
lf_detail_counter = 0
lf_counter += 1
#Commit The Entire Transaction.
session.commit()
print("NoThread:Finished validating %s file(s) with %s record(s) in %s seconds\n" %(lf_counter, record_counter, time.time()- process_start_time))
1.1. Validation Function for Non-Threaded Script
class Validate():
has_error = None
def validator(self, loop_length, job):
'''Validate data'''
for row_counter in range(loop_length):
load_file_detail_id, load_file_id, entry_number, data, \
template_version, load_file_detail, error, dt_file, dt_columns = job[row_counter]
error_detail = ErrorLogDetail()
if data.strip() == "":
error_detail.errorlog = error
error_detail.load_file_detail_id = load_file_detail_id
error_detail.pos_row = entry_number
error_detail.pos_col = None
error_detail.value_provided = None
error_detail.column_name = None
error_detail.value_provided = None
error_detail.description = "error message 1"
session.add(error_detail)
error_detail = ErrorLogDetail()
self.has_error = True
self.set_validation(load_file_detail, load_file_detail_id, True, False)
continue
elif len(data) != int(dt_file.data_length):
error_detail.errorlog = error
error_detail.load_file_detail_id = load_file_detail_id = load_file_detail_id
error_detail.pos_row = entry_number
error_detail.pos_col = None
error_detail.column_name = None
error_detail.value_provided = None
error_detail.description = "error message 2"
session.add(error_detail)
error_detail = ErrorLogDetail()
self.has_error = True
self.set_validation(load_file_detail, load_file_detail_id, True, False)
continue
else:
#Continue with extra validation
#If record passes all validation then mark mark it as haserror = False
if self.has_error == False:
self.set_validation(load_file_detail, load_file_detail_id, False, True)
else:
self.has_error = False
jobs.task_done() #For the script with threading the job is marked as done. Else this does not appear in the non-threaded script
2.0 Threaded Script
#Alot of code goes above this to fetch the data that is passed on to the validator function
#However, the two scripts are the same upto this point in regards to time taken so didn't see need to post them.
for lf_detail_id in load_file_detail_id:
params = lf_detail_id, load_file_id, entry_number[lf_detail_counter], \
data[lf_detail_counter], template_version[lf_counter], \
load_file_detail, error, dt_file, dt_columns
data_list.append(params)
lf_detail_counter += 1
queue_size += 1
if queue_size == THREAD_LIMIT:
myqueuing(queue_size, data_list)
queue_size = 0
#spawn a pool of threads, and pass them queue instance
if queue_size > 0:
myqueuing(queue_size, data_list)
#Keep record of rows processed
record_counter += lf_detail_counter
print("Validated '%s': seconds- %s " %(filename[lf_counter], time.time()-file_start_time)) #print time it took to run'
#Mark the load file as validated
is_done = load_file.set_validation(load_file_id, True)
if is_done == False:
raise Exception ("Can't update load_file's is_validated parameter: ", lf_detail_id)
#Commit The Entire Transaction.
session.commit()
#Reset counters
lf_detail_counter = 0
lf_counter += 1
data_list = None
queue_size = 0
print("HasThread:Finished loading %s file(s) with %s record(s) in %s seconds\n" %(lf_counter, record_counter, time.time()-process_start_time)) #print time it took to run'
2.1. Threaded Validation Function
THREAD_LIMIT = 50 # This is how many threads we want
jobs = queue.Queue() # This sets up the queue object to use 5 slots
singlelock = threading.Lock() # This is a lock so threads don't print trough each other (and other reasons)
def myqueuing(queuesize, data):
'''Put the fetched data in a queue and instantiate threads to
process the queue'''
# Spawn the threads
is_valid_date("20131212", True) #Calling this here to avoid a bug in time.strptime() when threading
for x in range(queuesize):
# This is the thread class that we instantiate.
workerbee().start()
# Put stuff in queue
for i in range(queuesize):
# Block if queue is full, and wait 2 seconds. After 5s raise Queue Full error.
try:
jobs.put(data[i], block=True, timeout=2)
except:
singlelock.acquire()
print ("The queue is full !")
singlelock.release()
# Wait for the threads to finish
singlelock.acquire() # Acquire the lock so we can print
print ("Waiting for threads to finish.")
singlelock.release() # Release the lock
jobs.join() # This command waits for all threads to finish.
class workerbee(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
self.lock = threading.Lock()
self.has_error = False
def run(self):
#try:
job = jobs.get(True,1)
load_file_detail_id, load_file_id, entry_number, data, \
template_version, load_file_detail, error, dt_file, dt_columns = job
'''Validates the data.'''
error_detail = ErrorLogDetail()
#Again please note that this part is identical for both the non-threaded and the threaded script.
#After each pass on a record, the record is marked as validated and if has_error = True
if data.strip() == "":
error_detail.errorlog = error
error_detail.load_file_detail_id = load_file_detail_id
error_detail.pos_row = entry_number
error_detail.pos_col = None
error_detail.value_provided = None
error_detail.column_name = None
error_detail.value_provided = None
error_detail.description = "erro message1"
session.add(error_detail)
error_detail = ErrorLogDetail()
self.has_error = True
self.set_validation(load_file_detail, load_file_detail_id, True, True)
elif len(data) != int(dt_file.data_length):
error_detail.errorlog = error
error_detail.load_file_detail_id = load_file_detail_id = load_file_detail_id
error_detail.pos_row = entry_number
error_detail.pos_col = None
error_detail.column_name = None
error_detail.value_provided = None
error_detail.description = "erro message2"
session.add(error_detail)
error_detail = ErrorLogDetail()
self.has_error = True
self.set_validation(load_file_detail, load_file_detail_id, True, True)
else:
#Continue with further validation - about 5 other validation checks
#If record passes all validation then mark mark it as haserror = False
if self.has_error == False:
self.set_validation(load_file_detail, load_file_detail_id, False, True)
else:
self.has_error = False
jobs.task_done() #For the script with threading the job is marked as done. Else this does not appear in the non-threaded script
3.0. Common function for setting validation in both threaded and non-threaded
def set_validation(self, load_file_detail, load_file_detail_id, has_error, can_be_loaded):
'''Mark the record as having been validated and whether has error = True or False'''
#print("haserror and canbeloaded ", has_error, can_be_loaded)
is_done = load_file_detail.set_validation_and_error(load_file_detail_id, True, has_error, can_be_loaded)
if is_done == False:
raise Exception ("Can't update load_file_detail's is_validated parameter: ", load_file_detail_id)
3.1. Actual SQLAlchemy session for saving the validation status
def set_validation_and_error(self, load_file_detail_id, is_validated, has_error, can_be_loaded):
result = session.execute('UPDATE load_file_detail SET is_validated=%s, has_error=%s, can_be_loaded=%s WHERE id=%s' \
%(is_validated, has_error, can_be_loaded, load_file_detail_id))
So, the fetching of data to be validated is the same, and both scripts take the same amount of time up to that point. The validation process is the same for both scripts and saving to the DB is the same, i.e. sections 3.0 and 3.1 are shared by both scripts. The only difference is that the validation runs in multiple threads. So I am thinking: maybe there is something about multiple threads and SQLAlchemy that is making the app slower in threaded mode? Have I implemented the threaded function in the proper way, or is threading simply not suitable in this scenario? Suggestions welcome.
You should create a Queue for logging and add a dedicated "logger" thread; once you remove the locking code it should be faster.
Also create a DB connection in each thread so the data can be fetched in parallel.
Threads only parallelize C-library calls, because of the GIL.
To parallelize Python code you must use multiprocessing.
I wrote a test for you, describing how to process an iterable:
def produce_data(amount=100000, invalid=1, valid=10):
# produce_data = sql('request').getall()
import random
id = 0
data = [True]*valid + [False]*invalid
while id < amount:
id+=1
yield (id,random.choice(data))
def validate(row):
if row[1]:
time.sleep(0.001) #set valid sql request emulation.
return True
else:
time.sleep(0.001) #set invalid sql request emulation.
return False
def single():
for row in produce_data():
validate(row)
def targeted():
import threading
for row in produce_data():
threading.Thread(target=validate,args=(row,))
Uley = 50
class Bee(object):
error=False
running = True
def __init__(self,queue,*args,**kwargs):
self.queue=queue #dont use any global variable!
# every bee must have unique db connection and session.
#self.session = db.connection().session()
# initialize it there.
return super(Bee,self).__init__(*args,**kwargs)
def run(self):
while self.running:
data=self.queue.get()
if data:
self.error = validate(data) # refactor it to self.validate(data) to be able to get cursor from self.session.
self.queue.task_done()
else:
self.queue.task_done()
break
#self.session.commit()
def treaded():
import threading,Queue
class TreadedBee(Bee,threading.Thread): pass
q = Queue.Queue()
for i in range(Uley): #bees started before data was provided.
bee=TreadedBee(q)
bee.daemon = True
bee.start()
for row in produce_data(): #you dont need to get all data to begin processing, at this place must be cursor of response.
q.put(row)
q.join()
for i in range(Uley):
q.put(None)
def forked():
from multiprocessing import Process,JoinableQueue
class ForkedBee(Bee,Process): pass
q = JoinableQueue()
for i in range(Uley):
bee=ForkedBee(q)
bee.start()
for row in produce_data():
q.put(row)
q.join()
#at this you need to kill zomBee -)
for i in range(Uley):
q.put(None)
q.close()
def pool():
from multiprocessing import Pool
pool = Pool(processes=Uley)
pool.map(validate,produce_data())
if __name__ == "__main__":
import time
s=time.time()
single()
print(time.time()-s) #109
s=time.time()
single()
print(time.time()-s) #6
s=time.time()
treaded()
print(time.time()-s) #12
s=time.time()
forked()
print(time.time()-s) #6
s=time.time()
pool()
print(time.time()-s) #4
test result:
$ python2 tgreads.py
109.779700994
5.84457302094
12.3814198971
5.97618508339
3.69856286049
targeted will flood the CPU and memory, and you can't provide individual connections to the DB; using a shared connection is not safe. If you want to go this way, you need to provide an output queue and implement a collector thread that communicates with the DB. pool is the shortest code and the fastest, but it is not friendly to initiating per-worker connections.
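If you do want the Pool variant with per-worker DB connections, one common pattern (a sketch only; make_session is a hypothetical factory, not something defined above) is to pass an initializer so every worker process opens its own session once:
from multiprocessing import Pool

_session = None  # one per worker process

def init_worker():
    global _session
    _session = make_session()  # hypothetical: e.g. a new SQLAlchemy session bound to a fresh engine

def validate_row(row):
    # validate() from the test above; a real worker would use the per-process _session here
    return validate(row)

def pooled():
    pool = Pool(processes=Uley, initializer=init_worker)
    try:
        pool.map(validate_row, produce_data())
    finally:
        pool.close()
        pool.join()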