File modification time is different from Python time - python

time.time() returns a bigger timestamp than the modification time of a file created afterwards. Why?
import time
import os

p = '/tmp/a3'
before_creation = time.time()
open(p, 'w').close()  # create (or truncate) the file
creation = os.path.getmtime(p)
print(before_creation > creation, before_creation - creation)
print(before_creation, creation)
>>
True 0.005841493606567383
1609357964.7233727 1609357964.7175312
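One likely explanation, assuming Linux: the kernel stamps file times from a coarser, tick-cached clock than the high-resolution clock behind time.time(), so a file created just after the time.time() call can still receive an earlier mtime. A minimal sketch comparing the two clocks directly (Linux-only API):

import time

# time.time() reads the fine-grained realtime clock; CLOCK_REALTIME_COARSE
# is the tick-cached clock that file timestamps are typically derived from.
fine = time.clock_gettime(time.CLOCK_REALTIME)
coarse = time.clock_gettime(time.CLOCK_REALTIME_COARSE)
print(fine - coarse)  # often a few milliseconds, much like the gap above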


How can I use tqdm to add a progress bar in this function?

I received a Python script from a colleague, and (given the large amount of data and the processing time) I would like to add a progress bar so I can check its progress at any time.
from Bio import SeqIO
from tqdm import tqdm
import csv
import pandas as pd
import re
import time

# Find code in "metadata_silva_simplified.txt" file
path_to_file = "metadata_silva_simplified.txt"
df = pd.read_csv("Name.csv")
counter = 0
Code = []
Names = []
Missing = []
t = time.time()
for index in df.index:
    #print("------------------------------------------------------------------------")
    #print(str(counter) + "- " + df["0"][index])
    name = str(df["0"][index])
    with open(path_to_file, "r") as file:
        for line in file:
            coincident = 0
            ref = line[(line.find("|") + 1):]
            ref = ref[:(ref.find("|") - 1)]
            ref = ref.strip()
            if name == ref:
                #if ref.find(name) != -1:
                coincident = 1
                position = line.find("|") - 1
                Code.append("kraken:taxid|" + line[:position])
                Names.append(name)
                #print("kraken:taxid|" + line[:position])
                break
        if coincident == 0:
            Missing.append(name)
    counter += 1
    if (counter % 1000) == 0:
        print(str(round(counter / 5105.08)) + "% completed")

Code = {'Code': Code, 'Name': Names}
dfcodes = pd.DataFrame(Code)
dfcodes.to_csv("Codes_secondpart.csv", index=False)
missing = pd.DataFrame(Missing)
missing.to_csv("Missing_secondpart.csv", index=False)
elapsed = time.time() - t
print("Mean time per sample=" + str(elapsed / counter))
I thought of incorporating the progress bar using the tqdm Python tool, but I don't know how to include it in the code above.
You already imported tqdm. Wrap your loop in a tqdm call and it should work:
for index in tqdm(df.index):
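Because df.index has a known length, tqdm computes the percentage and ETA on its own, so the manual counter % 1000 print can be dropped. A small self-contained sketch of the call, with a sleep standing in for the real per-row file scan:

import time
from tqdm import tqdm

items = range(5105)  # hypothetical stand-in for df.index
for item in tqdm(items, desc="Matching names", unit="row"):
    time.sleep(0.001)  # placeholder for the per-row work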

Generating millions of rows using faker is very slow

I am trying to generate 3.3 million fake rows using Python, as in the snippet below.
Generating the file is very, very slow. Any help speeding this up?
Python version - 3.9.7
import os, csv, time, sys
from datetime import datetime
from faker import Faker
from time import sleep
from progress.bar import Bar

os.system('clear')
sCount = "distID.in"
fake = Faker()
startTime = datetime.now()
count = sum(1 for line in open(sCount))
fakeFile = open('fakeFile.csv', 'w')
bar = Bar('Processing', max=count)
with open(sCount) as piiFile:
    i = 666000000
    for oldID in piiFile:
        i = i + 1
        fn = fake.first_name()
        ln = fake.last_name()
        dob = (f'{fake.date_of_birth()}')
        fakeFile.write(f'{i},{fn},{ln},{dob},{oldID}' + '\n')
        bar.next()
fakeFile.close()
bar.finish()
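One common approach, sketched below under the assumption that the per-row Faker calls are the bottleneck: generate a fixed pool of fake values once, then draw rows from it with random.choice, which is far cheaper per call. The pool size here is hypothetical.

import random
from faker import Faker

fake = Faker()

POOL = 10_000  # hypothetical pool size; a larger pool gives more variety
first_names = [fake.first_name() for _ in range(POOL)]
last_names = [fake.last_name() for _ in range(POOL)]
dobs = [str(fake.date_of_birth()) for _ in range(POOL)]

with open('distID.in') as pii_file, open('fakeFile.csv', 'w') as fake_file:
    i = 666000000
    for old_id in pii_file:
        i += 1
        fake_file.write(f'{i},{random.choice(first_names)},'
                        f'{random.choice(last_names)},'
                        f'{random.choice(dobs)},{old_id.strip()}\n')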

How do I implement and execute threading with multiple classes in Python?

I'm very new to Python (most of my previous programming experience is in intermediate C++ and Java) and am trying to develop a script which will read sensor data and log it to a .csv file. To do this I created separate classes: one reads the sensor data and outputs it to the console, while the other is supposed to take that data and log it. I then combined them into a master script containing each class. Separately, they work perfectly, but together only the sensorReader class functions. I am trying to get each class to run in its own thread, while also passing the sensor data from the first class (sensorReader) to the second class (csvWriter). I've posted some of my pseudocode below, but I'd be happy to clarify any questions with the actual source code if needed.
import time
import sensorStuff
import csv
import threading
import datetime

class sensorReader:
    # Initializers for the sensors.
    this.code(initializes the sensors)
    while True:
        try:
            this.code(prints the sensor data to the console)
        this.code(throws exceptions)
        this.code(waits 60 seconds)

class csvWriter:
    this.code(fetches the date and time)
    this.code(writes the headers for the excel sheet once)
    while True:
        this.code(gets date and time)
        this.code(writes the time and one row of data to excel)
        this.code(writes a message to console then repeats every minute)

r = sensorReader()
t = threading.Thread(target = r, name = "Thread #1")
t.start()
t.join
w = csvWriter()
t = threading.Thread(target = w, name = "Thread #2")
t.start()
I realize the last part doesn't really make sense, but I'm really punching above my weight here, so I'm not even sure why only the first class works and not the second, let alone how to implement threading for multiple classes. I would really appreciate it if anyone could point me in the right direction.
Thank you!
EDIT
I've decided to put up the full source code:
import time
import board
import busio
import adafruit_dps310
import adafruit_dht
import csv
import threading
import datetime
# import random

class sensorReader:
    # Initializers for the sensors.
    i2c = busio.I2C(board.SCL, board.SDA)
    dps310 = adafruit_dps310.DPS310(i2c)
    dhtDevice = adafruit_dht.DHT22(board.D4)
    while True:
        # Print the values to the console.
        try:
            global pres
            pres = dps310.pressure
            print("Pressure = %.2f hPa" % pres)
            global temperature_c
            temperature_c = dhtDevice.temperature
            global temperature_f
            temperature_f = temperature_c * (9 / 5) + 32
            global humidity
            humidity = dhtDevice.humidity
            print("Temp: {:.1f} F / {:.1f} C \nHumidity: {}% "
                  .format(temperature_f, temperature_c, humidity))
            print("")
        # Errors happen fairly often with DHT sensors, and will occasionally throw exceptions.
        except RuntimeError as error:
            print("n/a")
            print("")
        # Waits 60 seconds before repeating.
        time.sleep(10)

class csvWriter:
    # Fetches the date and time for future file naming and data logging operations.
    starttime = time.time()
    x = datetime.datetime.now()
    # Writes the header for the .csv file once.
    with open('Weather Log %s.csv' % x, 'w', newline='') as f:
        fieldnames = ['Time', 'Temperature (F)', 'Humidity (%)', 'Pressure (hPa)']
        thewriter = csv.DictWriter(f, fieldnames=fieldnames)
        thewriter.writeheader()
    # Fetches the date and time.
    while True:
        from datetime import datetime
        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        # Writes incoming data to the .csv file.
        with open('Weather Log %s.csv', 'a', newline='') as f:
            fieldnames = ['TIME', 'TEMP', 'HUMI', 'PRES']
            thewriter = csv.DictWriter(f, fieldnames=fieldnames)
            thewriter.writerow({'TIME' : current_time, 'TEMP' : temperature_f, 'HUMI' : humidity, 'PRES' : pres})
        # Writes a message confirming the data's entry into the log, then sets a 60 second repeat cycle.
        print("New entry added.")
        time.sleep(10.0 - ((time.time() - starttime) % 10.0))  # Repeat every ten seconds.

r = sensorReader()
t = threading.Thread(target = r, name = "Thread #1")
t.start()
t.join
w = csvWriter()
t = threading.Thread(target = w, name = "Thread #2")
t.start()
It would work better structured like this. If you put the first loop in a function, you can delay its evaluation until you're ready to start the thread. But in a class body it would run immediately and you never get to the second definition.
def sensor_reader():
    # Initializers for the sensors.
    this.code(initializes the sensors)
    while True:
        try:
            this.code(prints the sensor data to the console)
        except:
            print()
        this.code(waits 60 seconds)

threading.Thread(target=sensor_reader, name="Thread #1", daemon=True).start()

this.code(fetches the date and time)
this.code(writes the headers for the excel sheet once)
while True:
    this.code(gets date and time)
    this.code(writes the time and one row of data to excel)
    this.code(writes a message to console then repeats every minute)
I made it a daemon so it will stop when you terminate the program. Note also that we only needed to create one thread, since we already have the main thread.
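A minimal runnable sketch of the same pattern, with the hardware calls stubbed out by random values and a queue handing readings from the sensor thread to the main thread:

import queue
import random
import threading
import time

readings = queue.Queue()

def sensor_reader():
    # Stand-in for the real sensor loop; pushes a fake reading each second.
    while True:
        readings.put(random.uniform(900.0, 1100.0))  # hypothetical pressure, hPa
        time.sleep(1)

threading.Thread(target=sensor_reader, name="Thread #1", daemon=True).start()

# The main thread plays the csvWriter role: consume and "log" readings.
for _ in range(3):
    pres = readings.get()  # blocks until a reading arrives
    print(f"Logged pressure: {pres:.2f} hPa")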

What is the fastest way to write to s3 with a glue job? (Write to csv/parquet from dynamicframe)

My current problem is that writing to s3 from a dynamic frame for small files is taking forever (more than an hour for a 100,000-line csv with ~100 columns). I am trying to write to parquet and csv, so I guess that's 2 write operations, but it's still taking a long time. Is there something wrong with my code, or is pyspark just usually this slow?
It should be noted that I am testing my script from a zeppelin notebook + dev endpoint (5 DPUs) to circumvent the 10-minute cold start, but I hope this isn't the reason why it's so slow. I am using spark 2.4 and python 3.
%pyspark
import boto3
import sys
import time
import uuid
from datetime import datetime
from awsglue.context import GlueContext
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql.functions import input_file_name

def some_mapping(rec):
    # does something trivial
    return rec

start = time.time()
print("Starting")
args = {
    "key": "2000.csv",
    "input_bucket": "my-input-bucket",
    "output_bucket": "my-output-bucket",
}
output_path = args["output_bucket"]
connection_options = {"path": output_path}
s3 = boto3.resource("s3")
input_bucket = s3.Bucket(args["input_bucket"])
db = boto3.resource("dynamodb", region_name="us-east-1")

# Create a Glue context
glueContext = GlueContext(SparkContext.getOrCreate())
DyF = glueContext.create_dynamic_frame_from_options(
    connection_type="s3",
    connection_options={"paths": ["s3://{}/{}".format(args["input_bucket"], args["key"])]},
    format="csv",
    format_options={
        "withHeader": True,
        "separator": ","
    }
)
mapped_DyF = DyF.map(some_mapping)

# Write to s3
end = time.time()
print("Time: ", end - start)  # Transformation takes less than 30 seconds
mapped_DyF.write(connection_type="s3",
                 connection_options={"path": "{}/parquet".format(args["output_bucket"])},
                 format="parquet")
end2 = time.time()  # Takes forever
print("Time: ", end2 - end)
mapped_DyF.write(connection_type="s3",
                 connection_options={"path": "{}/csv".format(args["output_bucket"])},
                 format="csv")
end3 = time.time()
print("Time: ", end3 - end2)  # Also takes forever
print("Time: ", end3 - start)  # Total time is > 1 hour.

Python: Add Processing Time to Output CSV File

The code below creates a pandas dataframe and outputs it into a csv file. It also clocks the processing time. How would I add the processing time to the csv file?
Thank you for your time.
import pandas as pd
import csv
import time

def procedure():
    time.sleep(0)

t0 = time.clock()
y = pd.DataFrame({'one': [1, 2, 3, 4], 'two': [3, 5, 7, 5], 'three': [2, 3, 4, 9], 'four': [4, 3, 1, 0]})
y.to_csv('temp1.csv')
procedure()
print(time.clock() - t0)
You can use time.time() instead:

import time

start_time = time.time()
finish_time = time.time() - start_time

To get the processing time, you can change your code like this:
import pandas as pd
import csv
import time

def procedure():
    time.sleep(0)

start_time = time.time()
y = pd.DataFrame({'one': [1, 2, 3, 4], 'two': [3, 5, 7, 5], 'three': [2, 3, 4, 9], 'four': [4, 3, 1, 0]})
y.to_csv('temp1.csv')
procedure()
finish_time = time.time() - start_time
print(finish_time)
Open the file in append mode to add the time to the end:

...
procedure()
elapsed = time.clock() - t0
with open('temp1.csv', 'a') as f:
    f.write(str(elapsed))
Your indents are off:

import pandas as pd
import csv
import time

def procedure():
    time.sleep(0)
    y = pd.DataFrame({'one': [1, 2, 3, 4], 'two': [3, 5, 7, 5], 'three': [2, 3, 4, 9], 'four': [4, 3, 1, 0]})
    y.to_csv('temp1.csv')

start_time = time.time()
procedure()
finish_time = time.time() - start_time
print(finish_time)
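Putting the pieces together, a minimal sketch that times the work with time.perf_counter (the usual replacement for time.clock, which was removed in Python 3.8) and appends the result to the same file, as the append-mode answer suggests:

import time
import pandas as pd

start = time.perf_counter()  # monotonic timer suited to measuring intervals
y = pd.DataFrame({'one': [1, 2, 3, 4], 'two': [3, 5, 7, 5],
                  'three': [2, 3, 4, 9], 'four': [4, 3, 1, 0]})
y.to_csv('temp1.csv')
elapsed = time.perf_counter() - start

# Append the elapsed time as a trailing line of the CSV.
with open('temp1.csv', 'a') as f:
    f.write(f'processing_time,{elapsed}\n')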
