Python Multiprocessing within Jupyter Notebook does not work

I am new to the multiprocessing module in Python and work with Jupyter notebooks.
When I try to run the following code I keep getting AttributeError: Can't get attribute 'load' on <module '__main__' (built-in)>
When I run it as a file there is no output; it just keeps loading.
import pandas as pd
import datetime
import urllib
import requests
from pprint import pprint
import time
from io import StringIO
from multiprocessing import Process, Pool

symbols = ['AAP']
start = time.time()
dflist = []

def load(date):
    if date is None:
        return
    url = "http://regsho.finra.org/FNYXshvol{}.txt".format(date)
    try:
        df = pd.read_csv(url, delimiter='|')
        if any(df['Symbol'].isin(symbols)):
            stocks = df[df['Symbol'].isin(symbols)]
            print(stocks.to_string(index=False, header=False))
            # Save stocks to mysql
        else:
            print(f'No stock found for {date}')
    except urllib.error.HTTPError:
        pass

pool = []
numdays = 365
start_date = datetime.datetime(2019, 1, 15)  # year - month - day
datelist = [
    (start_date - datetime.timedelta(days=x)).strftime('%Y%m%d') for x in range(0, numdays)
]
pool = Pool(processes=16)
pool.map(load, datelist)
pool.close()
pool.join()
print(time.time() - start)
What can I do to run this code directly from the notebook without issues?

One way to do it: a function defined in a notebook cell lives in __main__, which the spawned worker processes cannot re-import, hence the AttributeError. So:
1. Move the load function out into its own module, for example worker.py.
2. Import worker and use worker.load.
3. Run the pool from a __main__ guard (a sketch of worker.py follows below):

import datetime
from multiprocessing import Pool

import worker

if __name__ == '__main__':
    numdays = 365
    start_date = datetime.datetime(2019, 1, 15)  # year - month - day
    datelist = [
        (start_date - datetime.timedelta(days=x)).strftime('%Y%m%d')
        for x in range(0, numdays)
    ]
    pool = Pool(processes=16)
    pool.map(worker.load, datelist)
    pool.close()
    pool.join()
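worker.py itself would hold the question's load function more or less verbatim; a minimal sketch (the symbols list moves into the module so every worker process can see it, and urllib.error is imported explicitly, since a bare "import urllib" does not reliably expose that submodule):

# worker.py -- a sketch based on the question's load function
import urllib.error

import pandas as pd

symbols = ['AAP']

def load(date):
    if date is None:
        return
    url = "http://regsho.finra.org/FNYXshvol{}.txt".format(date)
    try:
        df = pd.read_csv(url, delimiter='|')
        if any(df['Symbol'].isin(symbols)):
            stocks = df[df['Symbol'].isin(symbols)]
            print(stocks.to_string(index=False, header=False))
        else:
            print(f'No stock found for {date}')
    except urllib.error.HTTPError:
        pass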

Related

Multiprocessing using GetPass

I am running my multiprocessing script while using the getpass function in Python (import getpass), and I keep getting this error message. I am running the code from a .py file in my command prompt terminal, on Windows 10.
[screenshot: error message]
The following is my code:
import time
import multiprocessing
from multiprocessing import Pool
from multiprocessing import freeze_support
import getpass
import jaydebeapi
import pandas as pd
import numpy as np
from multiprocessing import Process, freeze_support, set_start_method

class multiprocess:
    def __init__(self):
        pass

    def test(self, Batch_test):
        pw_2 = getpass.getpass(prompt="Password", stream=False)
        PML = jaydebeapi.connect('com.ibm.db2.jcc.DB2Driver',
                                 'jdbc:db2://he3qlxvtdbs957.fhlmc.com:50001/PMLFDB2',
                                 ['f408195', pw_2], 'C:/JDBC/db2jcc.jar')
        PML = PML.cursor()
        Batch_query = "select id_hstrcl_data_bch_load_frst_evnt as btch_strt, id_hstrcl_data_bch_load_last_evnt as btch_end from UDBADM.hstrcl_data_bch_load WHERE ID_HSTRCL_DATA_BCH_LOAD BETWEEN 1 and 2"
        PML.execute(Batch_query)
        Batch_records = PML.fetchall()
        Batch_records = pd.DataFrame(Batch_records)
        for ind in Batch_test:
            print(ind)
            first_evnt = Batch_records.iloc[ind, 0]
            last_evnt = Batch_records.iloc[ind, 1]
            PML_loan_Query = "select CAST(b.id_lpa_alt_loan AS INT) AS id_lpa_alt_loan from udbadm.pml_lst_cmpltd_trans_mtch a join udbadm.lpa_altv_loan_idtn b on a.id_evnt = b.id_evnt where b.cd_lpa_alt_loan_idtn = 'HewlettPackardGeneratedTransaction' and a.id_evnt between ? and ?"
            PML.execute(PML_loan_Query, (first_evnt, last_evnt))
            loan_records = PML.fetchall()
        return loan_records

    def run(self):
        processes = []
        for i in range(2):
            p = multiprocessing.Process(target=self.test, args=(i,))
            processes.append(p)
        for p in processes:
            p.start()

if __name__ == '__main__':
    a = multiprocess()
    a.run()
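No answer was posted in this thread, but a common workaround for this pattern (a sketch, not from the original thread) is to prompt for the password once in the parent process and pass it to each worker as an argument, so that no spawned child has to read from stdin:

import getpass
import multiprocessing

def test(batch_index, password):
    # placeholder body: connect with jaydebeapi here using `password`
    # instead of calling getpass.getpass() inside the worker
    print(batch_index, "received a password of length", len(password))

if __name__ == '__main__':
    pw = getpass.getpass(prompt="Password")  # asked once, in the parent
    processes = []
    for i in range(2):
        p = multiprocessing.Process(target=test, args=(i, pw))
        processes.append(p)
    for p in processes:
        p.start()
    for p in processes:
        p.join()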

Why does my Python multiprocessing result not append on callback?

I can't seem to figure out why my results are not appending while using the multiprocessing package.
I've looked at many similar questions but can't seem to figure out what I'm doing wrong. This is my first attempt at multiprocessing (as you might be able to tell), so I don't quite understand all the jargon in the documentation, which might be part of the problem.
Running this in PyCharm prints an empty list instead of the desired list of row sums.
import numpy as np
from multiprocessing import Pool
import timeit

data = np.random.randint(0, 100, size=(5, 1000))

def add_these(numbers_to_add):
    added = np.sum(numbers_to_add)
    return added

results = []
tic = timeit.default_timer()  # start timer
pool = Pool(3)

if __name__ == '__main__':
    for row in data:
        pool.apply_async(add_these, row, callback=results.append)

toc = timeit.default_timer()  # stop timer
print(toc - tic)
print(results)
EDIT: Closing and joining the pool, then printing results within the if __name__ == '__main__' block, results in the following error being raised repeatedly until I manually stop execution:
RuntimeError:
An attempt has been made to start a new process before the current process has finished its bootstrapping phase. This probably means that you are not using fork to start your child processes and you have forgotten to use the proper idiom in the main module:
if __name__ == '__main__':
freeze_support()
...
The "freeze_support()" line can be omitted if the program is not going to be frozen to produce an executable.
Code to reproduce error:
import numpy as np
from multiprocessing import Pool, freeze_support
import timeit

data = np.random.randint(0, 100, size=(5, 1000))

def add_these(numbers_to_add):
    added = np.sum(numbers_to_add)
    return added

results = []
tic = timeit.default_timer()  # start timer
pool = Pool(3)

if __name__ == '__main__':
    for row in data:
        pool.apply_async(add_these, (row,), callback=results.append)
    pool.close()
    pool.join()
    print(results)

toc = timeit.default_timer()  # end timer
print(toc - tic)
I think this would be a more correct way (note that TimeoutError must come from multiprocessing; the built-in TimeoutError is a different class and would not be caught here):

import numpy as np
from multiprocessing import Pool, TimeoutError
import timeit

data = np.random.randint(0, 100, size=(5, 1000))

def add_these(numbers_to_add):
    added = np.sum(numbers_to_add)
    return added

if __name__ == '__main__':
    with Pool(processes=3) as pool:
        for row in data:
            result = pool.apply_async(add_these, (row,))
            try:
                print(result.get(timeout=1))
            except TimeoutError:
                print("Multiprocessing Timeout")

'NoneType' object is not iterable

I am trying to loop through a list of symbols to get rates for various currencies via MetaTrader5 (mt5). I use the code below, but I get a TypeError:
d[i] = [y.close for y in rates1]
TypeError: 'NoneType' object is not iterable
I can't see where I'm going wrong. I would like to use this structure to loop through, create multiple dataframes, and then build a big multi-index of all pairs and times using the same kind of loop. I've not been coding long.
sym = ['GBPUSD','USDJPY','USDCHF','AUDUSD','GBPJPY']

# Copying data to dataframe
d = pd.DataFrame()
for i in sym:
    rates1 = mt5.copy_rates_from(i, mt5.TIMEFRAME_M1, 5)
    d[i] = [y.close for y in rates1]
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 29 18:38:11 2020
@author: DanPc
"""
import pytz
import pandas as pd
import MetaTrader5 as mt5
import time
from datetime import datetime
from threading import Timer
import talib
import numpy as np
import matplotlib as plt
from multiprocessing import Process
import sys

server_name = ""   # ENTER DETAILS HERE
server_num = None  # ENTER DETAILS HERE
password = ""      # ENTER DETAILS HERE

# ------------------------------------------------------------------------------
def actualtime():
    # datetime object containing current date and time
    now = datetime.now()
    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
    # print("date and time =", dt_string)
    return str(dt_string)

# ------------------------------------------------------------------------------
def sync_60sec(op):
    info_time_new = datetime.strptime(str(actualtime()), '%d/%m/%Y %H:%M:%S')
    waiting_time = 60 - info_time_new.second
    t = Timer(waiting_time, op)
    t.start()
    print(actualtime)

# ------------------------------------------------------------------------------
def program(symbol):
    if not mt5.initialize(login=server_num, server=server_name, password=password):
        print("initialize() failed, error code =", mt5.last_error())
        quit()
    timezone = pytz.timezone("Etc/UTC")
    utc_from = datetime.now()
    ######### Change here the timeframe 525600
    # Create currency watchlist for which correlation matrix is to be plotted
    sym = ['GBPUSD','USDJPY','USDCHF','AUDUSD','GBPJPY']
    # Copying data to dataframe
    d = pd.DataFrame()
    for i in sym:
        rates1 = mt5.copy_rates_from(i, mt5.TIMEFRAME_M1, 5)
        d[i] = [y.close for y in rates1]
        print(rates1)
    mt5.shutdown()

if not mt5.initialize():
    print("initialize() failed, error code =", mt5.last_error())
    quit()

# starting mt5
if not mt5.initialize(login=server_num, server=server_name, password=password):
    print("initialize() failed, error code =", mt5.last_error())
    quit()

# ------------------------------------------------------------------------------
# S T A R T I N G   M T 5
# ------------------------------------------------------------------------------
authorized = mt5.login(server_num, password=password)
if authorized:
    account_info = mt5.account_info()
    if account_info != None:
        account_info_dict = mt5.account_info()._asdict()
        df = pd.DataFrame(list(account_info_dict.items()), columns=['property', 'value'])
        print("account_info() as dataframe:")
        print(df)
else:
    print(mt5.last_error)
    mt5.shutdown()

# ------------------------------------------------------------------------------
def trading_bot():
    symbol_1 = 'EURUSD'
    symbol_2 = 'EURCAD'
    while True:
        program(symbol_1)
        program(symbol_2)
        time.sleep(59.8)  # it depends on your computer and ping

sync_60sec(trading_bot)
copy_rates_from returns None if there is an error. The documentation suggests calling last_error() to find out what that error is.
(And no, I don't know why copy_rates_from doesn't just raise an exception to indicate the error. Apparently, the module is a thin wrapper around a C library.)
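Applied to the question's loop, that check might look like this (a sketch; note that copy_rates_from also wants a date_from argument, per its documented signature symbol, timeframe, date_from, count, and that rows of the returned structured array are indexed by field name):

for i in sym:
    rates1 = mt5.copy_rates_from(i, mt5.TIMEFRAME_M1, datetime.now(), 5)
    if rates1 is None:
        print("copy_rates_from({}) failed, error code =".format(i), mt5.last_error())
        continue
    d[i] = [y['close'] for y in rates1]  # structured-array field access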
I came to this solution that creates a dictionary of dataframes (the dict comprehension already iterates over sym, so no outer loop is needed):

sym = ["GBPUSD","USDJPY","USDCHF","AUDUSD","GBPJPY"]

# Copying data to a dictionary of dataframes
utc_from = datetime.now()
rates = {i: pd.DataFrame(mt5.copy_rates_from(i, mt5.TIMEFRAME_M1, utc_from, 60),
                         columns=['time', 'open', 'low', 'high', 'close',
                                  'tick_volume', 'spread', 'real_volume'])
         for i in sym}
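From that dictionary, the multi-index of pairs and times mentioned in the question can be assembled with pd.concat (a sketch, not from the original thread; concat treats the dict keys as the outer index level):

# One frame indexed by (pair, time)
big = pd.concat({k: v.set_index('time') for k, v in rates.items()}, names=['pair'])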

What is the fastest way to write to s3 with a glue job? (Write to csv/parquet from dynamicframe)

My current problem is that writing to S3 from a DynamicFrame for small files is taking forever (more than an hour for a 100,000-line CSV with ~100 columns). I am writing to both Parquet and CSV, so I guess that's two write operations, but it's still taking a long time. Is there something wrong with my code, or is PySpark just usually this slow?
It should be noted that I am testing my script from a Zeppelin notebook + dev endpoint (5 DPUs) to circumvent the 10-minute cold start, but I hope this isn't the reason why it's so slow. I am using Spark 2.4 and Python 3.
%pyspark
import boto3
import sys
import time
import uuid
from datetime import datetime
from awsglue.context import GlueContext
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql.functions import input_file_name

def some_mapping(rec):
    # does something trivial
    return rec

start = time.time()
print("Starting")

args = {
    "key": "2000.csv",
    "input_bucket": "my-input-bucket",
    "output_bucket": "my-output-bucket",
}
output_path = args["output_bucket"]
connection_options = {"path": output_path}
s3 = boto3.resource("s3")
input_bucket = s3.Bucket(args["input_bucket"])
db = boto3.resource("dynamodb", region_name="us-east-1")

# Create a Glue context
glueContext = GlueContext(SparkContext.getOrCreate())
DyF = glueContext.create_dynamic_frame_from_options(
    connection_type="s3",
    connection_options={"paths": ["s3://{}/{}".format(args["input_bucket"], args["key"])]},
    format="csv",
    format_options={
        "withHeader": True,
        "separator": ","
    }
)
mapped_DyF = DyF.map(some_mapping)

# Write to s3
end = time.time()
print("Time: ", end - start)  # Transformation takes less than 30 seconds
mapped_DyF.write(connection_type="s3",
                 connection_options={"path": "{}/parquet".format(args["output_bucket"])},
                 format="parquet")
end2 = time.time()  # Takes forever
print("Time: ", end2 - end)
mapped_DyF.write(connection_type="s3",
                 connection_options={"path": "{}/csv".format(args["output_bucket"])},
                 format="csv")
end3 = time.time()
print("Time: ", end3 - end2)  # Also takes forever
print("Time: ", end3 - start)  # Total time is > 1 hour.

Python: Add Processing Time to Output CSV File

The code below creates a pandas dataframe and outputs it into a csv file. It also clocks the processing time. How would I add the processing time to the csv file?
Thank you for your time.
import pandas as pd
import csv
import time

def procedure():
    time.sleep(0)

t0 = time.clock()
y = pd.DataFrame({'one': [1,2,3,4], 'two': [3,5,7,5], 'three': [2,3,4,9], 'four': [4,3,1,0]})
y.to_csv('temp1.csv')
procedure()
print(time.clock() - t0)
import time

start_time = time.time()
finish_time = time.time() - start_time

To get the processing time, you can use your code like that:

import pandas as pd
import csv
import time

def procedure():
    time.sleep(0)

start_time = time.time()
y = pd.DataFrame({'one': [1,2,3,4], 'two': [3,5,7,5], 'three': [2,3,4,9], 'four': [4,3,1,0]})
y.to_csv('temp1.csv')
procedure()
finish_time = time.time() - start_time
print(finish_time)
Open the file in append mode to add the time to the end:

...
procedure()
elapsed = time.clock() - t0
with open('temp1.csv', 'a') as f:
    f.write(str(elapsed))
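A slightly tidier variant (a note, not from the original answer) writes the time as its own labeled CSV row, so it does not sit as a bare number at the end of the file:

with open('temp1.csv', 'a', newline='') as f:
    f.write("processing_time,{}\n".format(elapsed))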
Your indents are off:

import pandas as pd
import csv
import time

def procedure():
    time.sleep(0)
    y = pd.DataFrame({'one': [1,2,3,4], 'two': [3,5,7,5], 'three': [2,3,4,9], 'four': [4,3,1,0]})
    y.to_csv('temp1.csv')

start_time = time.time()
procedure()
finish_time = time.time() - start_time
print(finish_time)
