Python: Add Processing Time to Output CSV File

The code below creates a pandas DataFrame and writes it to a CSV file. It also clocks the processing time. How would I add the processing time to the CSV file?
Thank you for your time.
import pandas as pd
import csv
import time

def procedure():
    time.sleep(0)

t0 = time.clock()
y = pd.DataFrame({'one': [1, 2, 3, 4], 'two': [3, 5, 7, 5], 'three': [2, 3, 4, 9], 'four': [4, 3, 1, 0]})
y.to_csv('temp1.csv')
procedure()
print(time.clock() - t0)

import time
start_time = time.time()
finish_time = time.time() - start_time
To get the processing time, you can adapt your code like this:
import pandas as pd
import csv
import time

def procedure():
    time.sleep(0)

start_time = time.time()
y = pd.DataFrame({'one': [1, 2, 3, 4], 'two': [3, 5, 7, 5], 'three': [2, 3, 4, 9], 'four': [4, 3, 1, 0]})
y.to_csv('temp1.csv')
procedure()
finish_time = time.time() - start_time
print(finish_time)

Open the file in append mode to add the time to the end:
...
procedure()
elapsed = time.clock() - t0  # note: time.clock() was removed in Python 3.8; time.perf_counter() is the modern replacement
with open('temp1.csv', 'a') as f:
    f.write(str(elapsed))
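If you would rather have the elapsed time appear as a proper column in the CSV instead of a bare number appended at the end, one option is to measure the time and add it to the DataFrame before calling to_csv. This is only a sketch, not the original poster's code: the column name processing_time_s is a placeholder, it uses time.perf_counter() instead of the removed time.clock(), and it can only cover the DataFrame construction, since the column has to exist before the file is written.

import time
import pandas as pd

t0 = time.perf_counter()
y = pd.DataFrame({'one': [1, 2, 3, 4], 'two': [3, 5, 7, 5],
                  'three': [2, 3, 4, 9], 'four': [4, 3, 1, 0]})
elapsed = time.perf_counter() - t0

# store the measured time as a column (repeated on every row) before writing
y['processing_time_s'] = elapsed
y.to_csv('temp1.csv')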

Your indents are off:
import pandas as pd
import csv
import time

def procedure():
    time.sleep(0)
    y = pd.DataFrame({'one': [1, 2, 3, 4], 'two': [3, 5, 7, 5], 'three': [2, 3, 4, 9], 'four': [4, 3, 1, 0]})
    y.to_csv('temp1.csv')

start_time = time.time()
procedure()
finish_time = time.time() - start_time
print(finish_time)

Related

How can I use tqdm to add a progress bar in this function?

I received a Python script from a colleague, and (due to the large amount of data and the long processing time) I would like to add a progress bar so I can check its progress at any time.
from Bio import SeqIO
from tqdm import tqdm
import csv
import pandas as pd
import re
import time

# Find code in "metadata_silva_simplified.txt" file
path_to_file = "metadata_silva_simplified.txt"
df = pd.read_csv("Name.csv")
counter = 0
Code = []
Names = []
Missing = []

t = time.time()
for index in df.index:
    #print("------------------------------------------------------------------------")
    #print(str(counter) + "- " + df["0"][index])
    name = str(df["0"][index])
    with open(path_to_file, "r") as file:
        for line in file:
            coincident = 0
            ref = line[(line.find("|") + 1):]
            ref = ref[:(ref.find("|") - 1)]
            ref = ref.strip()
            if name == ref:
                #if ref.find(name) != -1:
                coincident = 1
                position = line.find("|") - 1
                Code.append("kraken:taxid|" + line[:position])
                Names.append(name)
                #print("kraken:taxid|" + line[:position])
                break
    if coincident == 0:
        Missing.append(name)
    counter += 1
    if (counter % 1000) == 0:
        print(str(round(counter / 5105.08)) + "% completed")

Code = {'Code': Code, 'Name': Names}
dfcodes = pd.DataFrame(Code)
dfcodes.to_csv("Codes_secondpart.csv", index=False)
missing = pd.DataFrame(Missing)
missing.to_csv("Missing_secondpart.csv", index=False)

elapsed = time.time() - t
print("Mean time per sample=" + str(elapsed / counter))
I thought about adding the progress bar with the tqdm Python tool, but I don't know how to include it in the code above.
You already imported tqdm. Wrap your loop in a tqdm call and it should work:
for index in tqdm(df.index):
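If you also want a label and an explicit total (so tqdm shows a percentage and ETA rather than just a raw count), a minimal sketch of the same idea, assuming the df and loop body from the question (the desc text is just a placeholder):

for index in tqdm(df.index, total=len(df.index), desc="Matching names"):
    name = str(df["0"][index])
    # ... rest of the loop body unchanged ...

Once the bar is in place, the manual counter-based "% completed" print every 1000 iterations can be dropped.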

Generating millions of rows using faker is very slow

I am trying to generate 3.3 million fake rows using Python with the snippet below.
Generating the file is very slow. Any suggestions to speed this up?
Python version - 3.9.7
import os, csv, time, sys
from datetime import datetime
from faker import Faker
from time import sleep
from progress.bar import Bar

os.system('clear')
sCount = "distID.in"
fake = Faker()
startTime = datetime.now()
count = sum(1 for line in open(sCount))
fakeFile = open('fakeFile.csv', 'w')
bar = Bar('Processing', max=count)
with open(sCount) as piiFile:
    i = 666000000
    for oldID in piiFile:
        i = i + 1
        fn = fake.first_name()
        ln = fake.last_name()
        dob = (f'{fake.date_of_birth()}')
        fakeFile.write(f'{i},{fn},{ln},{dob},{oldID}' + '\n')
        bar.next()
fakeFile.close()
bar.finish()
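No answer is included above. Since the per-row Faker calls dominate the runtime here, one commonly suggested direction (a sketch under that assumption, not a verified fix; make_row and the chunksize value are placeholders of my own) is to generate rows in parallel worker processes and stream them to the CSV writer:

import csv
from multiprocessing import Pool
from faker import Faker

fake = Faker()

def make_row(args):
    # build one fake row; oldID comes from the input file
    i, oldID = args
    return [i, fake.first_name(), fake.last_name(), str(fake.date_of_birth()), oldID]

if __name__ == '__main__':
    with open('distID.in') as f:
        ids = [line.strip() for line in f]
    tasks = [(666000000 + n + 1, oldID) for n, oldID in enumerate(ids)]
    with Pool() as pool, open('fakeFile.csv', 'w', newline='') as out:
        writer = csv.writer(out)
        # a large chunksize keeps inter-process overhead low for millions of small tasks
        for row in pool.imap(make_row, tasks, chunksize=1000):
            writer.writerow(row)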

File modification time is different than Python time

time.time() returns a bigger timestamp than the modification time of a file that is created afterwards. Why?
import time
import os
p = '/tmp/a3'
before_creation = time.time()
open(p, 'w')
creation = os.path.getmtime(p)
print(before_creation > creation, before_creation - creation)
print(before_creation, creation)
>>
True 0.005841493606567383
1609357964.7233727 1609357964.7175312
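No answer is included above. One small probe that may help narrow the discrepancy down (a sketch, assuming Python 3.7+ for the *_ns APIs) is to compare the nanosecond-resolution values directly, since time.time() and the mtime the filesystem records do not necessarily come from clocks with the same resolution:

import os
import time

p = '/tmp/a3'
before_ns = time.time_ns()
with open(p, 'w'):
    pass
st = os.stat(p)
# st_mtime_ns is the raw nanosecond timestamp the filesystem recorded
print(before_ns, st.st_mtime_ns, before_ns - st.st_mtime_ns)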

What is the fastest way to write to s3 with a glue job? (Write to csv/parquet from dynamicframe)

My current problem is that writing to s3 from a dynamic frame for small files is taking forever (more than an hour for a 100,000-line CSV with ~100 columns). I am trying to write to both parquet and csv, so I guess that's two write operations, but it's still taking a long time. Is there something wrong with my code, or is pyspark just usually this slow?
It should be noted that I am testing my script from a Zeppelin notebook + dev endpoint (5 DPUs) to circumvent the 10-minute cold start, but I hope this isn't the reason why it's so slow. I am using Spark 2.4 and Python 3.
%pyspark
import boto3
import sys
import time
import uuid
from datetime import datetime
from awsglue.context import GlueContext
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql.functions import input_file_name

def some_mapping(rec):
    # does something trivial
    return rec

start = time.time()
print("Starting")
args = {
    "key": "2000.csv",
    "input_bucket": "my-input-bucket",
    "output_bucket": "my-output-bucket",
}
output_path = args["output_bucket"]
connection_options = {"path": output_path}
s3 = boto3.resource("s3")
input_bucket = s3.Bucket(args["input_bucket"])
db = boto3.resource("dynamodb", region_name="us-east-1")

# Create a Glue context
glueContext = GlueContext(SparkContext.getOrCreate())

DyF = glueContext.create_dynamic_frame_from_options(
    connection_type="s3",
    connection_options={"paths": ["s3://{}/{}".format(args["input_bucket"], args["key"])]},
    format="csv",
    format_options={
        "withHeader": True,
        "separator": ","
    }
)
mapped_DyF = DyF.map(some_mapping)

# Write to s3
end = time.time()
print("Time: ", end - start)  # Transformation takes less than 30 seconds
mapped_DyF.write(connection_type="s3",
                 connection_options={"path": "{}/parquet".format(args["output_bucket"])},
                 format="parquet")
end2 = time.time()  # Takes forever
print("Time: ", end2 - end)
mapped_DyF.write(connection_type="s3",
                 connection_options={"path": "{}/csv".format(args["output_bucket"])},
                 format="csv")
end3 = time.time()
print("Time: ", end3 - end2)  # Also takes forever
print("Time: ", end3 - start)  # Total time is > 1 hour.

Python Multiprocessing within Jupyter Notebook does not work

I am new to the multiprocessing module in Python and work with Jupyter notebooks.
When I try to run the following code, I keep getting AttributeError: Can't get attribute 'load' on <module '__main__' (built-in)>
When I run the file there is no output, it just keeps loading.
import pandas as pd
import datetime
import urllib
import requests
from pprint import pprint
import time
from io import StringIO
from multiprocessing import Process, Pool

symbols = ['AAP']
start = time.time()
dflist = []

def load(date):
    if date is None:
        return
    url = "http://regsho.finra.org/FNYXshvol{}.txt".format(date)
    try:
        df = pd.read_csv(url, delimiter='|')
        if any(df['Symbol'].isin(symbols)):
            stocks = df[df['Symbol'].isin(symbols)]
            print(stocks.to_string(index=False, header=False))
            # Save stocks to mysql
        else:
            print(f'No stock found for {date}')
    except urllib.error.HTTPError:
        pass

pool = []
numdays = 365
start_date = datetime.datetime(2019, 1, 15)  # year - month - day
datelist = [
    (start_date - datetime.timedelta(days=x)).strftime('%Y%m%d') for x in range(0, numdays)
]
pool = Pool(processes=16)
pool.map(load, datelist)
pool.close()
pool.join()
print(time.time() - start)
What can I do to run this code directly from the notebook without issues?
One way to do it:
1. Move the load function out into a separate file, for example worker.py.
2. Import worker and use worker.load.
3. In the notebook, run:
from multiprocessing import Pool
import datetime
import worker

if __name__ == '__main__':
    numdays = 365
    start_date = datetime.datetime(2019, 1, 15)  # year - month - day
    datelist = [
        (start_date - datetime.timedelta(days=x)).strftime('%Y%m%d') for x in range(0, numdays)
    ]
    pool = Pool(processes=16)
    pool.map(worker.load, datelist)
    pool.close()
    pool.join()
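For completeness, a minimal sketch of what worker.py could contain: essentially the load function from the question moved into its own module, with the MySQL step left as a comment exactly as in the original.

# worker.py
import urllib.error
import pandas as pd

symbols = ['AAP']

def load(date):
    if date is None:
        return
    url = "http://regsho.finra.org/FNYXshvol{}.txt".format(date)
    try:
        df = pd.read_csv(url, delimiter='|')
        if any(df['Symbol'].isin(symbols)):
            stocks = df[df['Symbol'].isin(symbols)]
            print(stocks.to_string(index=False, header=False))
            # Save stocks to mysql
        else:
            print(f'No stock found for {date}')
    except urllib.error.HTTPError:
        pass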
