Generating millions of rows using faker is very slow - python

Am trying to generate 3.3 million fake rows using python as below snippet.
generating the file is very very slow. any help speedup this?
Python version - 3.9.7
import os, csv, time, sys
from datetime import datetime
from faker import Faker
from time import sleep
from progress.bar import Bar
os.system('clear')
sCount = "distID.in"
fake = Faker()
startTime = datetime.now()
count = sum(1 for line in open(sCount))
fakeFile = open('fakeFile.csv', 'w')
bar = Bar('Processing', max=count)
with open(sCount) as piiFile:
i=666000000
for oldID in piiFile:
i=i+1
fn = fake.first_name()
ln = fake.last_name()
dob = (f'{fake.date_of_birth()}')
fakeFile.write(f'{i},{fn},{ln},{dob},{oldID}'+'\n')
bar.next()
fakeFile.close()
bar.finish()

Related

How can I use tqdm to add a progress bar in this function?

I have received from one colleague a Python script but (due to the large amount of data and the time processing) I would like to include a progress bar to check at each time its progress.
from Bio import SeqIO
from tqdm import tqdm
import csv
import pandas as pd
import re
import time
# Find code in "metadata_silva_simplified.txt" file
path_to_file = "metadata_silva_simplified.txt"
df = pd.read_csv("Name.csv")
counter = 0
Code=[]
Names=[]
Missing=[]
t = time.time()
for index in df.index:
#print("------------------------------------------------------------------------")
#print(str(counter) + "- " + df["0"][index])
name=str(df["0"][index])
with open(path_to_file,"r") as file:
for line in file:
coincident=0
ref=line[(line.find("|")+1):]
ref=ref[:(ref.find("|")-1)]
ref=ref.strip()
if name == ref:
#if ref.find(name) != -1:
coincident=1
position = line.find("|")-1
Code.append("kraken:taxid|" + line[:position])
Names.append(name)
#print("kraken:taxid|" + line[:position])
break
if coincident==0:
Missing.append(name)
counter += 1
if (counter%1000) == 0:
print(str(round(counter/5105.08))+"% completed")
Code = {'Code':Code,'Name':Names}
dfcodes = pd.DataFrame(Code)
dfcodes.to_csv("Codes_secondpart.csv", index=False)
missing = pd.DataFrame(Missing)
missing.to_csv("Missing_secondpart.csv", index=False)
elapsed = time.time() - t
print("Mean time per sample=" + str(elapsed/counter))
I thought incorporating the progress bar through the use of tqdm Python tool, but I don't know how to include in the previous function attached above to run it.
You already imported tqdm. Wrap your loop in a tqdm call and it should work:
for index in tqdm(df.index):

Continue script after schedule until python

I would like to continue my python script after the schedule.every(x).seconds.until(x).do(job), but my program end the function until the x time and doesn't keep going after.
Here is my program :
import requests
import json
import pickle
import schedule
import sys
import time
import pandas as pd
from datetime import datetime, timedelta
from threading import Timer
import matplotlib
from datetime import datetime
import matplotlib.pyplot as plt
from colorama import Fore, Back, Style
import math
import numpy as np
import os
os.environ["PATH"] += os.pathsep + '/Library/TeX/texbin'
key = 'MY_API_KEY'
adress = 'https://api.openweathermap.org/data/2.5/weather'
params = {'appid':key, 'q': 'Lausanne', 'units':'metric'}
def somme(tab):
s=0
for i in range(len(tab)):
s=s+tab[i]
return s
def moyenne(tab):
return somme(tab)/len(tab)
tab=[]
tab2=[]
def function(tab,tab2):
response = requests.get(adress, params=params)
weather = response.json()
temp = weather['main']['temp']
print(temp)
tab.append(temp)
now = datetime.now()
time = now.strftime("%H:%M:%S")
tab2.append(time)
print(tab)
print(tab2)
print(moyenne(tab))
function(tab,tab2)
schedule.every(10).seconds.until("20:00").do(function, tab, tab2)
while True :
schedule.run_pending()
time.sleep(1)
fig = plt.figure(1,figsize=(9, 7))
ax = fig.add_subplot()
ax.set_xlabel('x')
ax.set_ylabel('y')
plt.plot(tab2,tab, 'b.')
plt.show()
I would like that my program plot the graph after it had done the schedule. Is it possible ?
Thank you for your help !
EDIT :
Thank you to Tim Roberts for the answer in the comments !
I just have to change my loop like this while datetime.now().hour < 20:

the DataFrameClinet class of python's package for influxdb uploads only the last line from the dataframe

i am trying to use python's package for influxdb to upload dataframe into the database
i am using the write_points class to write point into the database as given in the documentation(https://influxdb-python.readthedocs.io/en/latest/api-documentation.html)
every time i try to use the class it only updates the last line of the dataframe instead of the complete dataframe.
is this a usual behavior or there is some problem here?
given below is my script:
from influxdb import InfluxDBClient, DataFrameClient
import pathlib
import numpy as np
import pandas as pd
import datetime
db_client = DataFrameClient('dbserver', port, 'username', 'password', 'database',
ssl=True, verify_ssl=True)
today = datetime.datetime.now().strftime('%Y%m%d')
path = pathlib.Path('/dir1/dir/2').glob(f'pattern_to_match*/{today}.filename.csv')
for file in path:
order_start = pd.read_csv(f'{file}')
if not order_start.empty:
order_start['data_line1'] = (order_start['col1'] - \
order_start['col2'])*1000
order_start['data_line2'] = (order_start['col3'] - \
order_start['col4'])*1000
d1 = round(order_start['data_line1'].quantile(np.arange(0,1.1,0.1)), 3)
d2 = round(order_start['data_line2'].quantile(np.arange(0,1.1,0.1)), 3)
out_file = pd.DataFrame()
out_file = out_file.append(d1)
out_file = out_file.append(d2)
out_file = out_file.T
out_file.index = out_file.index.set_names(['percentile'])
out_file = out_file.reset_index()
out_file['percentile'] = out_file.percentile.apply(lambda x: f'{100*x:.0f}%')
out_file['tag_col'] = str(file).split('/')[2]
out_file['time'] = pd.to_datetime('today').strftime('%Y%m%d')
out_file = out_file.set_index('time')
out_file.index = pd.to_datetime(out_file.index)
db_client.write_points(out_file, 'measurement', database='database',
retention_policy='rp')
can anyone please help?

convert linux python multiprocessing to windows

I would like to use this Linux Python script in Windows Python.
how to rewrite it ? The part to be rewritten in multiprocessing part.
from __future__ import print_function
from collections import Counter
import glob
import multiprocessing
import os
import re
import sys
import time
def create_data(filepath):
...
return values
filepaths = glob.glob('*/*.txt')
num_tasks = len(filepaths)
p = multiprocessing.Pool()
results = p.imap(create_data, filepaths)
while (True):
completed = results._index
print("\r--- Completed {:,} out of {:,}".format(completed, num_tasks), end='')
sys.stdout.flush()
time.sleep(1)
if (completed == num_tasks): break
p.close()
p.join()
df_full = pd.DataFrame(list(results))
print()
thanks for your help.

Python: Looping a certain amount of time in order different process

I made a script which is suppose to use Tkinter to allow to choose and load files and store their content in different objects and then process each of these documents.
I would like to make the script able to process only a certain amount of documents determined by a question (the value is stored under "File_number")
For exemple: if at the question "how many files do you want to compare?"
the user enter 3
I would like the tkinter openfile window to ask only for 3 files then keep going
I am using the If Else statement like below
but it doesn't seem to work well and the code is really not pythonic.
Is there a better/shorter way to perform the same?
Thanks
My script look like this
import pandas as pd
from pandas import *
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import pylab
import pandas.io.data
import os
import Tkinter
from Tkinter import *
import tkFileDialog
import tkSimpleDialog
from tkFileDialog import askopenfilename
import sys
# Set up GUI
root = Tkinter.Tk(); root.withdraw()
# Prompt for user info
File_number = tkSimpleDialog.askinteger("File number", "How many files do you want to compare?")
# Prompt for file explorer
# Also extract the file_name
process_a = 0
if process_a = File_number:
break
else:
process_a = process_a + 1
fileloc1 = tkFileDialog.askopenfilename(parent=root, title='Choose file 1')
fileloc1_name_clean = os.path.splitext(fileloc1)[0]
fileloc1_name = os.path.basename(fileloc1_name_clean)
if process_a = File_number:
break
else:
process_a = process_a + 1
fileloc2 = tkFileDialog.askopenfilename(parent=root, title='Choose file 2')
fileloc2_name_clean = os.path.splitext(fileloc2)[0]
fileloc2_name = os.path.basename(fileloc2_name_clean)
if process_a = File_number:
break
else:
process_a = process_a + 1
fileloc3 = tkFileDialog.askopenfilename(parent=root, title='Choose file 3')
fileloc3_name_clean = os.path.splitext(fileloc3)[0]
fileloc3_name = os.path.basename(fileloc3_name_clean)
EDIT 1
The next part of my script is:
dfa_1 = pd.read_csv(fileloc1, delimiter='\t')
dfa_nodupli = dfa_1.drop_duplicates(cols='N', take_last=False)
dfa_nodu_2pep = dfa_nodupli[(dfa_nodupli['Peptides(95%)'] > 1)]
dfa_nodu_2pep = dfa_nodu_2pep[~dfa_nodu_2pep['Name'].str.contains('Keratin')]
dfa_nodu_2pep.to_csv(fileloc1_name + ".csv")
dfb_1 = pd.read_csv(fileloc2, delimiter='\t')
dfb_nodupli = dfb_1.drop_duplicates(cols='N', take_last=False)
dfb_nodu_2pep = dfb_nodupli[(dfb_nodupli['Peptides(95%)'] > 1)]
dfb_nodu_2pep = dfb_nodu_2pep[~dfb_nodu_2pep['Name'].str.contains('Keratin')]
dfb_nodu_2pep.to_csv(fileloc2_name + ".csv")
I modified your code, so that it works, in a way you want it ( I hope).
import Tkinter
import tkFileDialog
import tkSimpleDialog
from tkFileDialog import askopenfilename
import os
# Set up GUI
def main():
root = Tkinter.Tk();
root.withdraw()
# Prompt for user info
File_number = tkSimpleDialog.askinteger("File number",
"How many files do you want to compare?")
if not File_number:
return
user_fiels = []
max_file_no = int(File_number)
current_file = 1;
while(current_file <= max_file_no):
fileloc = tkFileDialog.askopenfilename(parent=root, title='Choose file {}'.format(current_file))
if not fileloc:
continue
fileloc_name_clean = os.path.splitext(fileloc)[0]
fileloc_name = os.path.basename(fileloc_name_clean)
user_fiels.append([fileloc, fileloc_name_clean, fileloc_name])
current_file += 1
#print(fileloc_name_clean, fileloc_name)
print(user_fiels)
main()
I use while loop to get file paths as many times as you want.

Categories

Resources