How to properly implement threading while writing out to a csv? - python

I'm pulling commit data from the Gerrit API, and the commit number is in the 226,000 range. Where I have to make a request to an endpoint for each and every commit, this is understandable taking a long time. I was wondering how I could best implement threading into my current process.
I have two classes, a Project class, which drills down and retrieves all commits associated with it, and saves them out as a Commit object that contains all the information necessary to then loop through and get the json associated with it. I am pulling them all into a big list, and then iterating through to call the get_data and write_data methods.
class Project(object):
def __init__(self, name):
self.name = name
self.commits = []
def add_commits(self, changes_list):
for change in changes_list:
change_id=change['change_id'],
revision_list=change['revisions']
self.commits.extend([Commit(rid, change_id)
for rid in revision_list.keys()])
def return_results(self, ger_obj, start=0):
self.ger = ger_obj
while True:
endpoint = (r'/changes/?q=project:{project}&o=ALL_REVISIONS&'
r'S={num}'.format(
project=self.name,
num=start
))
logging.info('Endpoint: {}'.format(endpoint))
try:
changes = ger_obj.get(endpoint)
self.add_commits(changes_list=changes)
except HTTPError:
break
start += 500
try:
if not changes[-1].get('_more_changes'):
break
except IndexError:
break
class Commit(object):
def __init__(self, rev_id, change_id):
self.rev_id = rev_id
self.change_id = change_id
def get_data(self, ger_obj):
endpoint = (r'/changes/{c_id}/revisions/{r_id}/commit'.format(
c_id=self.change_id[0],
r_id=self.rev_id
))
try:
self.data = ger_obj.get(endpoint)
except HTTPError as e:
logging.warning('Endpoint: {} did not return data'.format(
endpoint
))
else:
self.data['commitid'] = self.data.get('commit')
self.data['name'] = self.data.get('committer')['name']
self.data['email'] = self.data.get('committer')['email']
self.data['date'] = self.data.get('committer')['date']
hash = md5()
hash.update(json.dumps(self.data).encode('utf-8'))
self.data['etl_checksum_md5'] = hash.hexdigest()
self.data['etl_process_status'] = ETL_PROCESS_STATUS
self.data['etl_datetime_local'] = ETL_DATETIME_LOCAL
self.data['etl_pdi_version'] = ETL_PDI_VERSION
self.data['etl_pdi_build_version'] = ETL_PDI_BUILD_VERSION
self.data['etl_pdi_hostname'] = ETL_PDI_HOSTNAME
self.data['etl_pdi_ipaddress'] = ETL_PDI_IPADDRESS
self.data['message'] = self.data['message'].replace('\n', ' ').replace('|', '[pipe]')
def write_data(self, writer):
writer.writerow(self.data)
I'm thinking that the best place to implement the threads is once I have all the commits in a list and am ready to iterate over them:
projects = [Project(value['id']) for value in project_data.values()]
for project in projects[:10]:
if project.name in bad_names.keys():
project.name = bad_names[project.name]
project.return_results(rest)
all_commits.extend(project.commits)
fieldnames = get_fieldnames(
'ods_gerrit.staging_gerrit_commits',
REDSHIFT_POSTGRES_INFO)
with open('testfile.csv', 'wb') as outf:
writer = DictWriter(
outf,
fieldnames=fieldnames,
extrasaction='ignore',
delimiter='|'
)
# Implement Threading?
for commit in all_commits:
commit.get_data(rest)
try:
commit.write_data(writer=writer)
except AttributeError:
continue
except Exception:
print commit.data, 'caused an exception.'
continue
I've read a few threading tutorials, and am unsure as to how to properly do this. I'm particularly worried about overwriting data due to improper locking.

Related

How to apply changes to ontology saved on SQLite Database?

Everytime I create a new instance on my ontology, something goes wrong If I try to read from the same database again.
ps - these are all part of different views on Django
This is how I am adding instances to my ontology:
# OWLREADY2
try:
myworld = World(filename='backup.db', exclusive=False)
kiposcrum = myworld.get_ontology(os.path.dirname(__file__) + '/kipo.owl').load()
except:
print("Error opening ontology")
# Sync
#--------------------------------------------------------------------------
sync_reasoner()
seed = str(time.time())
id_unico = faz_id(seed)
try:
with kiposcrum:
# here I am creating my instance, these are all strings I got from the user
kiposcrum[input_classe](input_nome + id_unico)
if input_observacao != "":
kiposcrum[input_nome + id_unico].Observacao.append(input_observacao)
sync_reasoner()
status = "OK!"
myworld.close()
myworld.save()
except:
print("Mistakes were made!")
status = "Error!"
input_nome = "Mistakes were made!"
input_classe = "Mistakes were made!"
finally:
print(input_nome + " " + id_unico)
print(input_classe)
print(status)
This is how I am reading stuff from It:
# OWLREADY2
try:
myworld = World(filename='backup.db', exclusive=False)
kiposcrum = myworld.get_ontology(os.path.dirname(__file__) + '/kipo_fialho.owl').load()
except:
print("Error")
sync_reasoner()
try:
with kiposcrum:
num_inst = 0
# gets a list of properties given an instance informed by the user
propriedades = kiposcrum[instancia].get_properties()
num_prop = len(propriedades)
myworld.close()
I am 100% able to read from my ontology, but If I try to create an instance and then try to read the database again, something goes wrong.

Python for/while loop

Today i am working on a project about incoming phone calls being transcripted and getting saved into text files, but i am also kinda new to python and python loops.
I want to loop over a SQL server column and let each row loop trough the azure Speech to text service i use (all of the phonecall OID's). I have been stuck on this problem for a couple days now so i thought i might find some help here.
import azure.cognitiveservices.speech as speechsdk
import time
from os import path
from pydub import AudioSegment
import requests
import hashlib
import sys
import os.path
import pyodbc
databaseName = '*'
username = '*'
password = '*'
server = '*'
driver = '*'
try:
CONNECTION_STRING = 'DRIVER='+driver+';SERVER='+server+';DATABASE='+databaseName+';UID='+username+';PWD='+ password
conn = pyodbc.connect(CONNECTION_STRING)
cursor = conn.cursor()
storedproc = "* = *'"
cursor.execute(storedproc)
row = cursor.fetchone()
while row:
array = [(int(row[1]))]
row = cursor.fetchone()
i = 0
while i<len(array):
OID = (array[i])
i = i + 1
print(OID)
string = f"{OID}*"
encoded = string.encode()
result = hashlib.sha256(encoded)
resultHash = (result.hexdigest())
Telefoongesprek = requests.get(f"*{OID}", headers={f"api-key":f"{resultHash}"})
with open("Telefoongesprek.mp3", "wb") as f:
f.write(Telefoongesprek.content)
src = "Telefoongesprek.mp3"
dst = "Telefoongesprek.wav"
sound = AudioSegment.from_file(src)
sound.export(dst, format="wav")
def speech_recognize_continuous_from_file():
speech_config = speechsdk.SpeechConfig(subscription="*", region="*")
speech_config.speech_recognition_language = "nl-NL"
audio_config = speechsdk.audio.AudioConfig(filename="Telefoongesprek.wav")
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
done = False
def stop_cb(evt):
print('CLOSING on {}'.format(evt))
nonlocal done
done = True
all_results = []
def handle_final_result(evt):
all_results.append(evt.result.text)
speech_recognizer.recognized.connect(handle_final_result)
speech_recognizer.session_started.connect(handle_final_result)
speech_recognizer.session_stopped.connect(handle_final_result)
speech_recognizer.canceled.connect(handle_final_result)
speech_recognizer.session_stopped.connect(stop_cb)
speech_recognizer.canceled.connect(stop_cb)
speech_recognizer.start_continuous_recognition()
while not done:
time.sleep(.5)
speech_recognizer.stop_continuous_recognition()
print(all_results)
telefoongesprek = str(all_results)
filename = f"C:\\Users\\Beau\\Contact-verkeer\\contact-verkeer\\telefoon\\STT Transcriptions\\Telefoongesprek#{OID}.txt"
file = open(filename, "w")
file.write(telefoongesprek)
file.close()
speech_recognize_continuous_from_file()
cursor.close()
del cursor
conn.close()
except Exception as e:
print("Error: %s" % e)
everything works apart form each other but i just dont know how to place the loop and witch one i should use (For/While loop). right here im trying to loop over an array but i dont this this is correct.
Error message: Decoding failed. ffmpeg returned error code: 1
[mp3 # 000001cb8c57e0o0] Failed to read frame size: could not seek to 1073.
which i am pretty sure means that my azure function can't find an mp3 file, what means that the "Mp3 to Wav" convert doesn't work.
Thanks in advance!
If I understand your question, you have a database with lots of phone call details. One of the field value in each row is used to create the associated mp3 file. You want to do speech to text using azure on each of the mp3 file you have in your database.
So you can do it in two ways:
Iterate though all rows in the database and create all the associted files into a folder in the local disk with the OID as your filename.
Then write another loop to iterate through this folder and send the files for transcription to Azure Speech to Text service.
The other technique is to do everything in a single loop like the way you have shown which will require some corrections.
Ok, so now that part is clear, we can go into the speech to text part. So azure allow you to send the compressed format for transcription, which means you actually don't need to convert it into wav file.
Please have a look at the modified code below with the changes:
# code snippet borrowed from azure samples
def speech_recognize_continuous_from_file(filename):
class BinaryFileReaderCallback(speechsdk.audio.PullAudioInputStreamCallback):
def __init__(self, filename: str):
super().__init__()
self._file_h = open(filename, "rb")
def read(self, buffer: memoryview) -> int:
try:
size = buffer.nbytes
frames = self._file_h.read(size)
buffer[:len(frames)] = frames
return len(frames)
except Exception as ex:
print('Exception in `read`: {}'.format(ex))
raise
def close(self) -> None:
print('closing file')
try:
self._file_h.close()
except Exception as ex:
print('Exception in `close`: {}'.format(ex))
raise
# Creates an audio stream format. For an example we are using MP3 compressed file here
compressed_format = speechsdk.audio.AudioStreamFormat(compressed_stream_format=speechsdk.AudioStreamContainerFormat.MP3)
callback = BinaryFileReaderCallback(filename=filename)
stream = speechsdk.audio.PullAudioInputStream(stream_format=compressed_format, pull_stream_callback=callback)
speech_config = speechsdk.SpeechConfig(subscription="*", region="*")
speech_config.speech_recognition_language = "nl-NL"
audio_config = speechsdk.audio.AudioConfig(stream=stream)
# Creates a speech recognizer using a file as audio input, also specify the speech language
speech_recognizer = speechsdk.SpeechRecognizer(speech_config, audio_config)
done = False
def stop_cb(evt):
print('CLOSING on {}'.format(evt))
nonlocal done
done = True
all_results = []
def handle_final_result(evt):
all_results.append(evt.result.text)
speech_recognizer.recognized.connect(handle_final_result)
speech_recognizer.session_started.connect(handle_final_result)
speech_recognizer.session_stopped.connect(handle_final_result)
speech_recognizer.canceled.connect(handle_final_result)
speech_recognizer.session_stopped.connect(stop_cb)
speech_recognizer.canceled.connect(stop_cb)
speech_recognizer.start_continuous_recognition()
while not done:
time.sleep(.5)
speech_recognizer.stop_continuous_recognition()
print(all_results)
telefoongesprek = str(all_results)
filename = f"C:\\Users\\Beau\\Contact-verkeer\\contact-verkeer\\telefoon\\STT Transcriptions\\Telefoongesprek#{OID}.txt"
file = open(filename, "w")
file.write(telefoongesprek)
file.close()
try:
CONNECTION_STRING = 'DRIVER='+driver+';SERVER='+server+';DATABASE='+databaseName+';UID='+username+';PWD='+ password
conn = pyodbc.connect(CONNECTION_STRING)
cursor = conn.cursor()
storedproc = "* = *'"
cursor.execute(storedproc)
row = cursor.fetchone()
# loop through the rows
while row:
array = [(int(row[1]))]
i = 0
while i<len(array):
OID = (array[i])
i = i + 1
print(OID)
string = f"{OID}*"
encoded = string.encode()
result = hashlib.sha256(encoded)
resultHash = (result.hexdigest())
telefoongesprek_response = requests.get(f"*{OID}", headers={f"api-key":f"{resultHash}"})
# save the file to local disk as mp3
with open("Telefoongesprek.mp3", "wb") as f:
f.write(telefoongesprek_response.content)
# do the speech to text on the mp3 file
speech_recognize_continuous_from_file(f.name)
# fetch the next row
row = cursor.fetchone()
cursor.close()
del cursor
conn.close()
except Exception as e:
print("Error: %s" % e)
I haven't tested this full code as i don't have the db connections with me. Please fell free to modify for your use case and let me know if you have any issues.

Persist datetime values for use with gviz api

I have an python application that collects data from an MQTT broker and presents it to a website via the gviz python API:
DESCRIPTION= [ \
('Zeit', 'datetime'), \
('Temperatur', 'number'), \
('Feuchtigkeit', 'number'),\
('Batterie', 'number') \
]
def sendAnswer(conn):
#protect against the data supplier
Mutex.acquire()
Trans = deepcopy(DatArr)
Mutex.release()
#create and populate the DataTable
data_table = gviz_api.DataTable(DESCRIPTION)
data_table.LoadData(Trans)
Answer = data_table.ToJSon()
#send ti to the webserver
lng = len(Answer)
try:
conn.sendall(bytes("L{:06d};".format(lng),"UTF-8"))
conn.sendall(bytes(Answer,"UTF-8"))
except BaseException:
# if anything goes wrong, try again next time
pass
def on_message(client, userdata, message):
global Last, DatArr
#get the data from the broker
cur = json.loads(str(message.payload, encoding='utf-8'))
if cur == Last and len(DatArr)>2 : return
now = datetime.now()
# protect against the webserver
Mutex.acquire()
#add the data
DatArr.append([now, cur["temp"], cur["hum"], cur["bat"]])
#cleanup old values
Last = cur
for i in range(len(DatArr)):
if now - DatArr[0][0] > timedelta(days=1):
DatArr.pop(0)
else:
break
Mutex.release()
This works, but instead of keeping the data in the python variable I want to persist it in a file (preferrably JSON). But I cannot JSON.dump() a datetime variable and I cannot .LoadData() a string into a gviz DataTable. The python gviz also lacks an "addRow()". Any suggestions?
Much thanks in advance!
Based on the answers to this question: JSON datetime between Python and JavaScript
I found a solution and implemented it in a python module:
import json
import datetime
class DateTimeJSONEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, datetime.datetime):
return dict(nested_datetime=obj.isoformat())
else:
return super(DateTimeJSONEncoder, self).default(obj)
def datetime_decoder(d):
if len(d) == 1 and 'nested_datetime' in d:
return datetime.datetime.strptime(d['nested_datetime'], '%Y-%m-%dT%H:%M:%S.%f')
result = {}
for prop in d:
if isinstance(d[prop], dict):
result[prop] = datetime_decoder(d[prop])
else:
result[prop] = d[prop]
return result
The class and the function go as named parameters into the json.dump and json.load functions like this:
DatArr = json.load(DatFile, object_hook=djson.datetime_decoder)
and
json.dump(DatArr, DatFile, cls=djson.DateTimeJSONEncoder)
This persists the formerly global variable DatArr in the json file DatFile.
Thanks to all the posters to the above question for providing the information.

Method cannot access class variable of different class

I am writing an algorithm in Python that is supposed to sort children (out of a database table) into one of their chosen kindergarten wishes (also out of a database table) following certain criteria on who to guarantee a place in their chosen kindergarten first. For this I first wrote a KitaDAO class to link the programme to the database and fetch information out of certain tables, saving them as an object.
import pymysql
import json
from Kita import Kita
from Kind import Kind
from Element import Element
class KitaDAO():
def __init__(self):
self.db = pymysql.connect("localhost","projekt","projekt","kita" )
self.cursor = self.db.cursor()
self.kitaList = []
self.kinderList = []
def getKitas(self):
self.sql = "SELECT * FROM kitas"
try:
self.cursor.execute(self.sql)
self.results = self.cursor.fetchall()
for row in self.results:
thisKita = Kita(row[0],row[1],row[2],row[3],row[4],row[5],row[6],row[7],row[8])
self.kitaList.append(thisKita)
except Exception as e:
print (e)
return self.kitaList
def getWarteliste(self):
self.sql = "SELECT * FROM warteliste"
self.warteliste = []
try:
self.cursor.execute(self.sql)
self.results = self.cursor.fetchall()
for row in self.results:
thisElement = Element(row[0],row[1],row[2],row[3],row[4],row[5],row[6])
self.warteliste.append(thisElement)
except Exception as e:
print (e)
return self.warteliste
def getKinder(self):
self.sql = "SELECT * FROM kinderprofil"
try:
self.cursor.execute(self.sql)
self.results = self.cursor.fetchall()
for row in self.results:
thisKind = Kind(row[0],row[1],row[2],row[3],row[4],row[5],row[6])
self.kinderList.append(thisKind)
except Exception as e:
print (e)
return self.kinderList
def getKindOnWarteliste(self,kita,wunschnummer):
self.kinderList = []
self.warteliste = []
self.warteliste = self.getWarteliste()
if (wunschnummer == 1):
for i in self.warteliste:
if (kita == i.getWunsch1()):
self.kinderList.append(i.getKind())
elif (wunschnummer == 2):
for i in self.warteliste:
if (kita == i.getWunsch2()):
self.kinderList.append(i.getKind())
elif (wunschnummer == 3):
for i in self.warteliste:
if (kita == i.getWunsch3()):
self.kinderList.append(i.getKind())
else:
print("Error: Eine ungültige Wunschnummer wurde übergeben.")
return self.kinderList
If needed I can also post the classes Element, Kind and Kita in here but they basically only contain an __init__ method and if needed a get method. They also work, I have tested that before.
My problem is now, that in my main class called Sortierung I made thisDAO an instance of KitaDAO and want to use it to call methods and such, as normally. Sadly the class variable thisDAO is not accessible in a method of Sortierung. So basically this code has the response:
File "Sortierung.py", line 3, in <module> class Sortierung():
File "Sortierung.py", line 30, in Sortierung checkBetreuung(i,warteliste)
File "Sortierung.py", line 11, in checkBetreuung KinderObjektListe = thisDAO.getKinder()
nameError: name 'thisDAO' is not defined
I marked the lines in the code under here.
from KitaDAO import KitaDAO
class Sortierung(): #---------- This is line 3
kitas = []
thisDAO = KitaDAO()
kitas = thisDAO.getKitas()
def checkBetreuung(kita,kinderIDListe):
KinderObjektListe = []
KinderObjektListe = thisDAO.getKinder() #---------This is line 11
#left something out here that was irrelevant
for x in range(1,4):
for i in kitas:
warteliste = []
warteliste = thisDAO.getKindOnWarteliste(i.getID,x)
checkBetreuung(i,warteliste) #-------------This is line 30
Also BTW I am German that is why the variable names are all in German. Sorry :)
You don't need the Sortierung class at all (this is not Java; not everything needs to be encapsulated in a class) – the root problem is thisDAO ends up being a class attribute of it.
Something like
from KitaDAO import KitaDAO
thisDAO = KitaDAO()
kitas = thisDAO.getKitas()
def checkBetreuung(kita, kinderIDListe):
KinderObjektListe = thisDAO.getKinder()
for x in range(1,4):
for i in kitas:
warteliste = thisDAO.getKindOnWarteliste(i.getID(), x)
checkBetreuung(i, warteliste)
should do the trick, barring any other problems.

writing to files: switch to new file after X MB file capacity

I have millions of domains which I will send WHOIS query and record WHOIS response on some .txt file.
I would like to set maximum capacity for a single .txt output file. For example, let's say I started recording responses on out0.txt. I want to switch to out1.txt if out0.txt is >= 100mb. Same thing goes for out1.txt, if out1.txt>=100mb then start writing to out2.txtand so on.
I know that I can do if checks after each insertion, but I want my code to be fast: i.e. I thought if checks at each domain can slow down my code. (It will asynchronously query millions of domains).
I imagined a try-except block could solve my issue here, like this:
folder_name = "out%s.txt"
folder_number = 0
folder_name = folder_name % folder_number
f = open(folder_name, 'w+')
for domain in millions_of_domains:
try:
response_json = send_whois_query(domain)
f.write(response_json)
except FileGreaterThan100MbException:
folder_number += 1
folder_name = folder_name % folder_number
f = open(folder_name, 'w+')
f.write(response_json)
Any suggestions will be appreciated. Thank you for your time.
You can create a wrapper object that tracks how much data has been written, and opens a new file if you reached a limit:
class MaxSizeFileWriter(object):
def __init__(self, filenamepattern, maxdata=2**20, # default 1Mb
start=0, mode='w', *args, **kwargs):
self._pattern = filenamepattern
self._counter = start
self._mode = mode
self._args, self._kwargs = args, kwargs
self._max = maxdata
self._openfile = None
self._written = 0
def _open(self):
if self._openfile is not None:
filename = self._pattern.format(self._counter)
self._counter += 1
self._openfile = open(filename, mode=self._mode, *self._args, **self._kwargs)
def _close(self):
if self._openfile is not None:
self._openfile.close()
def __enter__(self):
return self
def __exit__(self, *args, **kwargs):
if self._openfile is not None:
self._openfile.close()
def write(self, data):
if self._written + len(data) > self._max:
# current file too full to fit data too, close it
# This will trigger a new file to be opened.
self._close()
self._open() # noop if already open
self._openfile.write(data)
self._written += len(data)
The above is a context manager, and can be used just like a regular file. Pass in a filename with a {} placeholder for the number to be inserted into:
folder_name = "out{}.txt"
with MaxSizeFileWriter(folder_name, maxdata=100 * 2**10) as f:
for domain in millions_of_domains:
response_json = send_whois_query(domain)
f.write(response_json)

Categories

Resources